Example #1
def test_load_linnerud():
    res = load_linnerud()
    assert_equal(res.data.shape, (20, 3))
    assert_equal(res.target.shape, (20, 3))
    assert_equal(len(res.target_names), 3)
    assert_true(res.DESCR)

    # test return_X_y option
    X_y_tuple = load_linnerud(return_X_y=True)
    bunch = load_linnerud()
    assert_true(isinstance(X_y_tuple, tuple))
    assert_array_equal(X_y_tuple[0], bunch.data)
    assert_array_equal(X_y_tuple[1], bunch.target)
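For reference, the two calling conventions exercised above, as a minimal standalone sketch (assuming scikit-learn >= 0.18, where return_X_y was added):

from sklearn.datasets import load_linnerud

bunch = load_linnerud()                 # Bunch with .data, .target, .DESCR, ...
X, y = load_linnerud(return_X_y=True)   # plain arrays, both of shape (20, 3)
assert (X == bunch.data).all() and (y == bunch.target).all()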
Example #2
def test_pls_errors():
    d = load_linnerud()
    X = d.data
    Y = d.target
    for clf in [pls_.PLSCanonical(), pls_.PLSRegression(), pls_.PLSSVD()]:
        clf.n_components = 4
        assert_raise_message(ValueError, "Invalid number of components", clf.fit, X, Y)
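What this test exercises, as a standalone sketch: linnerud has only three features, so asking for four components must fail. This uses the public sklearn.cross_decomposition path; the exact message text varies across scikit-learn versions:

from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import load_linnerud

X, Y = load_linnerud(return_X_y=True)
try:
    PLSRegression(n_components=4).fit(X, Y)   # only 3 components are possible
except ValueError as exc:
    print(exc)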
Example #3
def test_convergence_fail():
    d = load_linnerud()
    X = d.data
    Y = d.target
    pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1],
                                     max_iter=2, tol=1e-10)
    assert_warns(ConvergenceWarning, pls_bynipals.fit, X, Y)
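A sketch of the same check without the test helpers, catching the warning explicitly (assumes the public sklearn.cross_decomposition module and sklearn.exceptions.ConvergenceWarning):

import warnings
from sklearn.cross_decomposition import PLSCanonical
from sklearn.datasets import load_linnerud
from sklearn.exceptions import ConvergenceWarning

X, Y = load_linnerud(return_X_y=True)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    PLSCanonical(n_components=X.shape[1], max_iter=2, tol=1e-10).fit(X, Y)
assert any(issubclass(w.category, ConvergenceWarning) for w in caught)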
Example #4
def test_eigsym():

    d = load_linnerud()
    X = d.data

    n = 3
    X = dot(X.T, X)

    eig = EIGSym(num_comp = n, tolerance = 5e-12)
    eig.fit(X)
    Xhat = dot(eig.V, dot(eig.D, eig.V.T))

    assert_array_almost_equal(X, Xhat, decimal=4, err_msg="EIGSym does not" \
            " give the correct reconstruction of the matrix")

    [D,V] = np.linalg.eig(X)
    # linalg.eig does not return the eigenvalues in order, so need to sort
    idx = np.argsort(D, axis=None).tolist()[::-1]
    D = D[idx]
    V = V[:,idx]

    Xhat = dot(V, dot(np.diag(D), V.T))

    V, eig.V = direct(V, eig.V, compare = True)
    assert_array_almost_equal(V, eig.V, decimal=5, err_msg="EIGSym does not" \
            " give the correct eigenvectors")
Example #6
def test_scale():
    d = load_linnerud()
    X = d.data
    Y = d.target

    # causes X[:, -1].std() to be zero
    X[:, -1] = 1.0

    for clf in [pls.PLSCanonical(), pls.PLSRegression(), pls.CCA(), pls.PLSSVD()]:
        clf.set_params(scale=True)
        clf.fit(X, Y)
Example #7
def test_load_linnerud():
    res = load_linnerud()
    assert_equal(res.data.shape, (20, 3))
    assert_equal(res.target.shape, (20, 3))
    assert_equal(len(res.target_names), 3)
    assert_true(res.DESCR)
    assert_true(os.path.exists(res.data_filename))
    assert_true(os.path.exists(res.target_filename))

    # test return_X_y option
    check_return_X_y(res, partial(load_linnerud))
Example #8
def test_PLSSVD():
    # Check that PLSSVD doesn't return all possible components, just
    # the specified number
    d = load_linnerud()
    X = d.data
    Y = d.target
    n_components = 2
    for clf in [pls_.PLSSVD, pls_.PLSRegression, pls_.PLSCanonical]:
        pls = clf(n_components=n_components)
        pls.fit(X, Y)
        assert_equal(n_components, pls.y_scores_.shape[1])
Example #9
def test_univariate_pls_regression():
    # Ensure 1d Y is correctly interpreted
    d = load_linnerud()
    X = d.data
    Y = d.target

    clf = pls_.PLSRegression()
    # Compare 1d to column vector
    model1 = clf.fit(X, Y[:, 0]).coef_
    model2 = clf.fit(X, Y[:, :1]).coef_
    assert_array_almost_equal(model1, model2)
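Standalone, the same 1-d versus column-vector check reads as follows (a sketch; the coef_ layout has changed across scikit-learn versions, but the two fits should always agree with each other):

import numpy as np
from sklearn.cross_decomposition import PLSRegression
from sklearn.datasets import load_linnerud

X, Y = load_linnerud(return_X_y=True)
coef_1d = PLSRegression().fit(X, Y[:, 0]).coef_    # 1-d target
coef_col = PLSRegression().fit(X, Y[:, :1]).coef_  # (n, 1) column target
np.testing.assert_array_almost_equal(coef_1d, coef_col)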
Example #10
def test_scale_and_stability():
    # We test the scale=True parameter
    # This also allows checking numerical stability across platforms

    d = load_linnerud()
    X1 = d.data
    Y1 = d.target
    # causes X[:, -1].std() to be zero
    X1[:, -1] = 1.0

    # From bug #2821
    # Test with X2, Y2 s.t. clf.x_score[:, 1] == 0, clf.y_score[:, 1] == 0
    # This tests the robustness of the algorithm when dealing with values close to 0
    X2 = np.array([[0., 0., 1.],
                   [1., 0., 0.],
                   [2., 2., 2.],
                   [3., 5., 4.]])
    Y2 = np.array([[0.1, -0.2],
                   [0.9, 1.1],
                   [6.2, 5.9],
                   [11.9, 12.3]])

    for (X, Y) in [(X1, Y1), (X2, Y2)]:
        X_std = X.std(axis=0, ddof=1)
        X_std[X_std == 0] = 1
        Y_std = Y.std(axis=0, ddof=1)
        Y_std[Y_std == 0] = 1

        X_s = (X - X.mean(axis=0)) / X_std
        Y_s = (Y - Y.mean(axis=0)) / Y_std

        for clf in [CCA(), pls_.PLSCanonical(), pls_.PLSRegression(),
                    pls_.PLSSVD()]:
            clf.set_params(scale=True)
            X_score, Y_score = clf.fit_transform(X, Y)
            clf.set_params(scale=False)
            X_s_score, Y_s_score = clf.fit_transform(X_s, Y_s)
            assert_array_almost_equal(X_s_score, X_score)
            assert_array_almost_equal(Y_s_score, Y_score)
            # Scaling should be idempotent
            clf.set_params(scale=True)
            X_score, Y_score = clf.fit_transform(X_s, Y_s)
            assert_array_almost_equal(X_s_score, X_score)
            assert_array_almost_equal(Y_s_score, Y_score)
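The manual standardization this test compares against is plain z-scoring with the sample standard deviation; as a standalone sketch of that helper (constant columns are left unscaled, mirroring the X_std[X_std == 0] = 1 guard above):

import numpy as np

def zscore(A):
    std = A.std(axis=0, ddof=1)   # ddof=1: sample standard deviation
    std[std == 0] = 1.0           # leave constant columns unscaled
    return (A - A.mean(axis=0)) / std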
Example #11
    def load_linnerud():
        from sklearn.datasets import load_linnerud
        linnerud = load_linnerud()

        # print(linnerud.DESCR)

        print(linnerud.keys())

        # print(linnerud.feature_names)
        # Chins     : number of chin-ups
        # Situps    : number of sit-ups
        # Jumps     : jumps

        # print(linnerud.target_names)
        # ['Weight', 'Waist', 'Pulse']

        X = linnerud.data
        y = linnerud.target
        return SklearnDataGenerator.shuffle(X, y)
Example #12
def test_predict_transform_copy():
    # check that the "copy" keyword works
    d = load_linnerud()
    X = d.data
    Y = d.target
    clf = pls_.PLSCanonical()
    X_copy = X.copy()
    Y_copy = Y.copy()
    clf.fit(X, Y)
    # check that results are identical with copy
    assert_array_almost_equal(clf.predict(X), clf.predict(X.copy(), copy=False))
    assert_array_almost_equal(clf.transform(X), clf.transform(X.copy(), copy=False))

    # also check when passing Y
    assert_array_almost_equal(clf.transform(X, Y), clf.transform(X.copy(), Y.copy(), copy=False))
    # check that the copy option didn't destroy the original data;
    # we do want to check exact equality here
    assert_array_equal(X_copy, X)
    assert_array_equal(Y_copy, Y)
    # also check that mean wasn't zero before (to make sure we didn't touch it)
    assert_true(np.all(X.mean(axis=0) != 0))
Example #13
from sklearn.datasets import load_linnerud
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from MyMultiOutputRegressor.MultiOutputRegressor import *

inputs, outputs = load_linnerud(return_X_y=True)
trainInputs, validationInputs, trainOutputs, validationOutputs = train_test_split(
    inputs, outputs, test_size=0.20, random_state=1)

scaler = StandardScaler()
scaler.fit(trainInputs)
trainInputs = scaler.transform(trainInputs)
validationInputs = scaler.transform(validationInputs)

scaler.fit(trainOutputs)
trainOutputs = scaler.transform(trainOutputs)
validationOutputs = scaler.transform(validationOutputs)

print("------------------------sklearn multioutput regressor----------------")
model = MultiOutputRegressor(Ridge(random_state=1)).fit(
    trainInputs, trainOutputs)
predictedOutputs = model.predict(validationInputs)
error = mean_squared_error(validationOutputs, predictedOutputs)

print(model.estimators_[0].intercept_, model.estimators_[0].coef_)
print(model.estimators_[1].intercept_, model.estimators_[1].coef_)
print(model.estimators_[2].intercept_, model.estimators_[2].coef_)
print('prediction error', error)
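Note that the targets were standardized, so the error above is in standardized units. To report it in the original units, the fitted scaler can be inverted (a sketch; this works because the scaler was last fit on the outputs):

predictedOriginal = scaler.inverse_transform(predictedOutputs)
validationOriginal = scaler.inverse_transform(validationOutputs)
print('prediction error (original units)',
      mean_squared_error(validationOriginal, predictedOriginal))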
Example #14
# Import necessary libraries
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression


# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the regressors with your newly split data instead of X and y.

#Solution
# Prepare the data as features and labels.
features = X
labels = y

# split the data into training and testing sets
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.4, random_state=0)


# Create decision tree regressor/algorithm object
Example #15
"""
This example introduces the Regressor object in a
multi-target regression task.
"""

# Author: Alex Wozniakowski <*****@*****.**>

import pandas as pd

from sklearn.datasets import load_linnerud
from sklearn.model_selection import train_test_split

from physlearn import Regressor

# Load the data from Sklearn
bunch = load_linnerud(as_frame=True)  # returns a Bunch instance
X, y = bunch['data'], bunch['target']

# Split the data, using the default test_size=0.25.
# X_train has shape (15, 3), y_train has shape (15, 3)
# X_test has shape (5, 3), and y_test has shape (5, 3).
# Namely, there are 3 features and 3 single-target regression subtasks.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Choose the underlying regressor to be the Sklearn
# histogram-based gradient boosting regressor.
regressor_choice = 'HistGradientBoostingRegressor'

# Choose the Sklearn QuantileTransformer as the data preprocessor.
# The output distribution is the Gaussian, e.g., 'normal'.
# The number of quantiles is the number of examples in y_train,
Example #16
def load_UCI_dataset(dsIn):
    '''Loads a UCI dataset

    :param dsIn: the dataset name
    :return: A SciKit dataset
    '''

    from sklearn import datasets
    allDSets = {"iris":datasets.load_iris(), "boston":datasets.load_boston(), "diabetes":datasets.load_diabetes(), " linnerud":datasets.load_linnerud()}
    dataset = allDSets[dsIn]
    return dataset
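One caveat with the dictionary above: it instantiates every dataset on every call. A lazy variant (a sketch; load_boston is omitted because it was removed in scikit-learn 1.2) stores the loader functions and calls only the one requested:

from sklearn import datasets

_LOADERS = {"iris": datasets.load_iris,
            "diabetes": datasets.load_diabetes,
            "linnerud": datasets.load_linnerud}

def load_UCI_dataset(dsIn):
    return _LOADERS[dsIn]()   # invoke the loader only when asked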
Example #17
def test_pls():
    d = load_linnerud()
    X = d.data
    Y = d.target
    # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
    # ===========================================================
    # Compare 2 algo.: nipals vs. svd
    # ------------------------------
    pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1])
    pls_bynipals.fit(X, Y)
    pls_bysvd = pls_.PLSCanonical(algorithm="svd", n_components=X.shape[1])
    pls_bysvd.fit(X, Y)
    # check equalities of loading (up to the sign of the second column)
    assert_array_almost_equal(
        pls_bynipals.x_loadings_,
        pls_bysvd.x_loadings_,
        decimal=5,
        err_msg="nipals and svd implementations lead to different x loadings")

    assert_array_almost_equal(
        pls_bynipals.y_loadings_,
        pls_bysvd.y_loadings_,
        decimal=5,
        err_msg="nipals and svd implementations lead to different y loadings")

    # Check PLS properties (with n_components=X.shape[1])
    # ---------------------------------------------------
    plsca = pls_.PLSCanonical(n_components=X.shape[1])
    plsca.fit(X, Y)
    T = plsca.x_scores_
    P = plsca.x_loadings_
    Wx = plsca.x_weights_
    U = plsca.y_scores_
    Q = plsca.y_loadings_
    Wy = plsca.y_weights_

    def check_ortho(M, err_msg):
        K = np.dot(M.T, M)
        assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(Wx, "x weights are not orthogonal")
    check_ortho(Wy, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(T, "x scores are not orthogonal")
    check_ortho(U, "y scores are not orthogonal")

    # Check X = TP' and Y = UQ' (with (p == q) components)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # center scale X, Y
    Xc, Yc, x_mean, y_mean, x_std, y_std =\
        pls_._center_scale_xy(X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'")
    assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'")

    # Check that rotations on training data lead to scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xr = plsca.transform(X)
    assert_array_almost_equal(Xr,
                              plsca.x_scores_,
                              err_msg="rotation on X failed")
    Xr, Yr = plsca.transform(X, Y)
    assert_array_almost_equal(Xr,
                              plsca.x_scores_,
                              err_msg="rotation on X failed")
    assert_array_almost_equal(Yr,
                              plsca.y_scores_,
                              err_msg="rotation on Y failed")

    # Check that inverse_transform works
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xreconstructed = plsca.inverse_transform(Xr)
    assert_array_almost_equal(Xreconstructed,
                              X,
                              err_msg="inverse_transform failed")

    # "Non regression test" on canonical PLS
    # --------------------------------------
    # The results were checked against the R-package plspm
    pls_ca = pls_.PLSCanonical(n_components=X.shape[1])
    pls_ca.fit(X, Y)

    x_weights = np.array([[-0.61330704, 0.25616119, -0.74715187],
                          [-0.74697144, 0.11930791, 0.65406368],
                          [-0.25668686, -0.95924297, -0.11817271]])
    # x_weights_sign_flip holds columns of 1 or -1, depending on sign flip
    # between R and python
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_rotations = np.array([[-0.61330704, 0.41591889, -0.62297525],
                            [-0.74697144, 0.31388326, 0.77368233],
                            [-0.25668686, -0.89237972, -0.24121788]])
    x_rotations_sign_flip = pls_ca.x_rotations_ / x_rotations

    y_weights = np.array([[+0.58989127, 0.7890047, 0.1717553],
                          [+0.77134053, -0.61351791, 0.16920272],
                          [-0.23887670, -0.03267062, 0.97050016]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_rotations = np.array([[+0.58989127, 0.7168115, 0.30665872],
                            [+0.77134053, -0.70791757, 0.19786539],
                            [-0.23887670, -0.00343595, 0.94162826]])
    y_rotations_sign_flip = pls_ca.y_rotations_ / y_rotations

    # x_weights = X.dot(x_rotation)
    # Hence R/python sign flip should be the same in x_weight and x_rotation
    assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip)
    # This tests that R / python give the same result up to column
    # sign indeterminacy
    assert_array_almost_equal(np.abs(x_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip)
    assert_array_almost_equal(np.abs(y_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 2) Regression PLS (PLS2): "Non regression test"
    # ===============================================
    # The results were checked against the R-packages plspm, mixOmics and pls
    pls_2 = pls_.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array([[-0.61330704, -0.00443647, 0.78983213],
                          [-0.74697144, -0.32172099, -0.58183269],
                          [-0.25668686, 0.94682413, -0.19399983]])
    x_weights_sign_flip = pls_2.x_weights_ / x_weights

    x_loadings = np.array([[-0.61470416, -0.24574278, 0.78983213],
                           [-0.65625755, -0.14396183, -0.58183269],
                           [-0.51733059, 1.00609417, -0.19399983]])
    x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings

    y_weights = np.array([[+0.32456184, 0.29892183, 0.20316322],
                          [+0.42439636, 0.61970543, 0.19320542],
                          [-0.13143144, -0.26348971, -0.17092916]])
    y_weights_sign_flip = pls_2.y_weights_ / y_weights

    y_loadings = np.array([[+0.32456184, 0.29892183, 0.20316322],
                           [+0.42439636, 0.61970543, 0.19320542],
                           [-0.13143144, -0.26348971, -0.17092916]])
    y_loadings_sign_flip = pls_2.y_loadings_ / y_loadings

    # x_loadings[:, i] = Xi.dot(x_weights[:, i]) \forall i
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 3) Another non-regression test of Canonical PLS on random dataset
    # =================================================================
    # The results were checked against the R-package plspm
    n = 500
    p_noise = 10
    q_noise = 5
    # 2 latents vars:
    rng = check_random_state(11)
    l1 = rng.normal(size=n)
    l2 = rng.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    X = latents + rng.normal(size=4 * n).reshape((n, 4))
    Y = latents + rng.normal(size=4 * n).reshape((n, 4))
    X = np.concatenate((X, rng.normal(size=p_noise * n).reshape(n, p_noise)),
                       axis=1)
    Y = np.concatenate((Y, rng.normal(size=q_noise * n).reshape(n, q_noise)),
                       axis=1)

    pls_ca = pls_.PLSCanonical(n_components=3)
    pls_ca.fit(X, Y)

    x_weights = np.array([[0.65803719, 0.19197924, 0.21769083],
                          [0.7009113, 0.13303969, -0.15376699],
                          [0.13528197, -0.68636408, 0.13856546],
                          [0.16854574, -0.66788088, -0.12485304],
                          [-0.03232333, -0.04189855, 0.40690153],
                          [0.1148816, -0.09643158, 0.1613305],
                          [0.04792138, -0.02384992, 0.17175319],
                          [-0.06781, -0.01666137, -0.18556747],
                          [-0.00266945, -0.00160224, 0.11893098],
                          [-0.00849528, -0.07706095, 0.1570547],
                          [-0.00949471, -0.02964127, 0.34657036],
                          [-0.03572177, 0.0945091, 0.3414855],
                          [0.05584937, -0.02028961, -0.57682568],
                          [0.05744254, -0.01482333, -0.17431274]])
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_loadings = np.array([[0.65649254, 0.1847647, 0.15270699],
                           [0.67554234, 0.15237508, -0.09182247],
                           [0.19219925, -0.67750975, 0.08673128],
                           [0.2133631, -0.67034809, -0.08835483],
                           [-0.03178912, -0.06668336, 0.43395268],
                           [0.15684588, -0.13350241, 0.20578984],
                           [0.03337736, -0.03807306, 0.09871553],
                           [-0.06199844, 0.01559854, -0.1881785],
                           [0.00406146, -0.00587025, 0.16413253],
                           [-0.00374239, -0.05848466, 0.19140336],
                           [0.00139214, -0.01033161, 0.32239136],
                           [-0.05292828, 0.0953533, 0.31916881],
                           [0.04031924, -0.01961045, -0.65174036],
                           [0.06172484, -0.06597366, -0.1244497]])
    x_loadings_sign_flip = pls_ca.x_loadings_ / x_loadings

    y_weights = np.array([[0.66101097, 0.18672553, 0.22826092],
                          [0.69347861, 0.18463471, -0.23995597],
                          [0.14462724, -0.66504085, 0.17082434],
                          [0.22247955, -0.6932605, -0.09832993],
                          [0.07035859, 0.00714283, 0.67810124],
                          [0.07765351, -0.0105204, -0.44108074],
                          [-0.00917056, 0.04322147, 0.10062478],
                          [-0.01909512, 0.06182718, 0.28830475],
                          [0.01756709, 0.04797666, 0.32225745]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_loadings = np.array([[0.68568625, 0.1674376, 0.0969508],
                           [0.68782064, 0.20375837, -0.1164448],
                           [0.11712173, -0.68046903, 0.12001505],
                           [0.17860457, -0.6798319, -0.05089681],
                           [0.06265739, -0.0277703, 0.74729584],
                           [0.0914178, 0.00403751, -0.5135078],
                           [-0.02196918, -0.01377169, 0.09564505],
                           [-0.03288952, 0.09039729, 0.31858973],
                           [0.04287624, 0.05254676, 0.27836841]])
    y_loadings_sign_flip = pls_ca.y_loadings_ / y_loadings

    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_weights_, "x weights are not orthogonal")
    check_ortho(pls_ca.y_weights_, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_scores_, "x scores are not orthogonal")
    check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")

    # 4) Another "Non regression test" of PLS Regression (PLS2):
    #    Checking behavior when the first column of Y is constant
    # ===============================================
    # The results were compared against a modified version of plsreg2
    # from the R-package plsdepot
    X = d.data
    Y = d.target
    Y[:, 0] = 1
    pls_2 = pls_.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array([[-0.6273573, 0.007081799, 0.7786994],
                          [-0.7493417, -0.277612681, -0.6011807],
                          [-0.2119194, 0.960666981, -0.1794690]])
    x_weights_sign_flip = pls_2.x_weights_ / x_weights

    x_loadings = np.array([[-0.6273512, -0.22464538, 0.7786994],
                           [-0.6643156, -0.09871193, -0.6011807],
                           [-0.5125877, 1.01407380, -0.1794690]])
    x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings

    y_loadings = np.array([[0.0000000, 0.0000000, 0.0000000],
                           [0.4357300, 0.5828479, 0.2174802],
                           [-0.1353739, -0.2486423, -0.1810386]])

    # R/python sign flip should be the same in x_weight and x_rotation
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)

    # This tests that R / python give the same result up to column
    # sign indeterminacy
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    # For the PLSRegression with default parameters, it holds that
    # y_loadings==y_weights. In this case we only test that R/python
    # give the same result for the y_loadings irrespective of the sign
    assert_array_almost_equal(np.abs(pls_2.y_loadings_), np.abs(y_loadings), 4)
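The sign-flip device used throughout this test rests on one fact: PLS components are defined only up to sign, so dividing the fitted matrix by the reference matrix elementwise should yield columns that are uniformly +1 or -1. As a reusable sketch:

import numpy as np

def assert_equal_up_to_column_sign(fitted, reference, decimal=4):
    ratio = fitted / reference   # each column should be constant +1 or -1
    np.testing.assert_array_almost_equal(np.abs(ratio),
                                         np.ones_like(ratio), decimal=decimal)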
Example #18
######################### import stuff ##########################
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import load_linnerud
from sklearn.model_selection import train_test_split

######################## prepare the data ########################
X, y = load_linnerud(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=5)

######################## set learning variables ##################
learning_rate = 0.0005
epochs = 2000
batch_size = 3

######################## set some variables #######################
x = tf.placeholder(tf.float32, [None, 3], name='x')  # 3 features
y = tf.placeholder(tf.float32, [None, 3], name='y')  # 3 outputs

# hidden layer 1
W1 = tf.Variable(tf.truncated_normal([3, 10], stddev=0.03), name='W1')
b1 = tf.Variable(tf.truncated_normal([10]), name='b1')

# hidden layer 2
W2 = tf.Variable(tf.truncated_normal([10, 3], stddev=0.03), name='W2')
b2 = tf.Variable(tf.truncated_normal([3]), name='b2')
Example #19
def create_linnerud():
    linnerud_data = datasets.load_linnerud()
    assert False
    assert sm.var_test_scaler.mean_.all() == np.array([[7.8, 154.8,
                                                        104.4]]).all()
    assert sm.var_test_scaler.var_.all() == np.array(
        [[34.16, 5246.16, 6053.44]]).all()
    assert sm.obj_train_scaler.mean_.all() == np.array(
        [[173.06666667, 34.66666667, 56.53333333]]).all()
    assert sm.obj_train_scaler.var_.all() == np.array(
        [[341.79555556, 4.35555556, 58.38222222]]).all()
    assert sm.obj_test_scaler.mean_.all() == np.array([[195.2, 37.6,
                                                        54.8]]).all()
    assert sm.obj_test_scaler.var_.all() == np.array([[923.76, 19.44,
                                                       20.16]]).all()


model = tm.Surrogate_Models()
variables, objectives = datasets.load_linnerud(return_X_y=True)
model.random = 57757
model.update_database(np.ndarray.tolist(variables),
                      np.ndarray.tolist(objectives))
model._initialize_models()


def test_initialize_models():
    models = model.models
    assert 'lr' in models
    assert 'pr' in models
    assert 'mars' in models
    assert 'gpr' in models
    assert 'ann' in models
    assert 'rf' in models
Example #21
# (printed prediction matrix and score from a previous run omitted)

ll = load_linnerud()
print(ll.DESCR)
Example #22
def scikitAlgorithms_UCIDataset(input_dict):
    from sklearn import datasets
    allDSets = {"iris":datasets.load_iris(), "boston":datasets.load_boston(), "diabetes":datasets.load_diabetes(), " linnerud":datasets.load_linnerud()}
    dataset = allDSets[input_dict['dsIn']]
    output_dict = {}
    output_dict['dtsOut'] = dataset#(dataset.data, dataset.target)
    return output_dict
Example #23
def run_4(feature_to_plot):
    # Compare coefficient patterns of multi-output linear models on the linnerud data
    from sklearn.datasets import load_linnerud
    linnerud = load_linnerud()
    """
    print(linnerud.feature_names)
    print(linnerud.data)

    print(linnerud.target_names)
    print(linnerud.target)
    """
    X, Y = linnerud.data, linnerud.target

    # [print(y) for y in Y.T]

    coef_ridge_ = np.array([Ridge(alpha=0.5).fit(X, y).coef_ for y in Y.T])
    coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
    coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_

    coef_low_ranked_ = low_ranked_regression(X, Y, 3)

    # #############################################################################
    # Plot support and time series
    fig = plt.figure(figsize=(8, 5))
    plt.subplot(1, 2, 1)
    plt.spy(coef_lasso_)
    plt.xlabel('Feature')
    plt.ylabel('Time (or Task)')
    plt.text(10, 5, 'Lasso')
    plt.subplot(1, 2, 2)
    plt.spy(coef_multi_task_lasso_)
    plt.xlabel('Feature')
    plt.ylabel('Time (or Task)')
    plt.text(10, 5, 'MultiTaskLasso')
    fig.suptitle('Coefficient non-zero location')

    plt.tight_layout()
    plt.figure()
    lw = 1
    """
    plt.plot(Y[:, feature_to_plot], color='seagreen', linewidth=lw,
             label='Ground truth')
    """
    plt.plot(coef_lasso_[:, feature_to_plot],
             color='cornflowerblue',
             linewidth=lw,
             label='Lasso')
    plt.plot(coef_ridge_[:, feature_to_plot],
             color='red',
             linewidth=lw,
             label='Ridge')
    plt.plot(coef_low_ranked_[:, feature_to_plot],
             color='magenta',
             linewidth=lw,
             label='LowRanked')
    plt.plot(coef_multi_task_lasso_[:, feature_to_plot],
             color='gold',
             linewidth=lw,
             label='MultiTaskLasso')
    plt.legend(loc='upper center')
    plt.axis('tight')
    plt.ylim([-1.1, 1.1])
    plt.tight_layout()
    plt.show()
Example #24
print()

print('-------------------------- breast cancer dataset ------------------------')

from sklearn.datasets import load_breast_cancer
breast_cancer = load_breast_cancer()
print('A simple, classic dataset for binary classification tasks')
print('data attributes:', breast_cancer.keys())

print()
print('other datasets')
from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_wine
print('The Boston housing dataset load_boston is a classic dataset for regression tasks')
print('Its data attributes are', load_boston().keys())
print('The diabetes dataset load_diabetes is a classic dataset for regression tasks')
print('The handwritten digits dataset load_digits suits multiclass classification tasks')
print('The physical training dataset load_linnerud is a classic dataset for multivariate'
      ' regression; it contains two small datasets: Exercise, 20 observations of three'
      ' exercise variables (chin-ups, sit-ups, jumps), and Physiological, 20 observations'
      ' of three physiological variables (weight, waist, pulse)')
print(load_linnerud().keys())
print(load_linnerud().target)
print(load_linnerud().target_names)
print(load_linnerud().feature_names)
print('The wine dataset load_wine gives the amounts of 13 different constituents in 3 types'
      ' of wine, 178 samples in total, corresponding to three wines')
print(load_wine().target_names)
print(load_wine().keys())
print(load_wine().feature_names)
Example #25
def experimentVariables(projectName):
    '''
    This function returns all the variables necessary to start an experiment, including:
    name of the experiment
    dataset locations
    variables to report from the experiment
    what to report from the experiment
    '''
    computerName=os.environ.get('COMPUTERNAME')
    if projectName== 'unlabeledModelS':
        if computerName=='JULIAN':
            from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud
            
            datasets={'boston':load_boston(),'iris':load_iris(),'diabetes':load_diabetes(),'digits':load_digits(),'linnerud':load_linnerud()}
            print('working at JULIAN@CMU')
            dataset='digits'
            numTests=50
            experimentName='One'
            agmntlvl=0
            description='here the description of this experiment'
            data=datasets[dataset]['data']
            labels=datasets[dataset]['target']
            verbose=0
            plots=False
            signal2plot='f1_score_mv_predval_agmnt'
#             signal2plot='f1_score_val_predval_agmnt'
            
            #The next are variables that store the outcomes from the
            #experiment
            variables=\
            'spear=[]\n'

            return {'dataset':dataset,'numTests':numTests,'experimentName':experimentName,\
                    'description':description,'agmntlvl':agmntlvl,'variables':variables,'data':data,\
                    'labels':labels,'verbose':verbose,'plots':plots,'signal2plot':signal2plot}
        else:
            print('Variables not defined for Julian@Laptop')
Example #26
def loadRegSample(self):
    ''' load regression sample dataset '''
    self.data = load_linnerud()
    logger.info(self.data.DESCR)
Example #27
def test_predictions():

    d = load_linnerud()
    X = d.data
    Y = d.target
    tol = 5e-12
    miter = 1000
    num_comp = 2
    Xorig = X.copy()
    Yorig = Y.copy()
#    SSY = np.sum(Yorig**2)
#    center = True
    scale  = False


    pls1 = PLSRegression(n_components = num_comp, scale = scale,
                 tol = tol, max_iter = miter, copy = True)
    pls1.fit(Xorig, Yorig)
    Yhat1 = pls1.predict(Xorig)

    SSYdiff1 = np.sum((Yorig-Yhat1)**2)
#    print "PLSRegression: R2Yhat = %.4f" % (1 - (SSYdiff1 / SSY))

    # Compare PLSR and sklearn.PLSRegression
    pls3 = PLSR(num_comp = num_comp, center = True, scale = scale,
                tolerance = tol, max_iter = miter)
    pls3.fit(X, Y)
    Yhat3 = pls3.predict(X)

    assert_array_almost_equal(Yhat1, Yhat3, decimal = 5,
            err_msg = "PLSR gives wrong prediction")

    SSYdiff3 = np.sum((Yorig-Yhat3)**2)
#    print "PLSR         : R2Yhat = %.4f" % (1 - (SSYdiff3 / SSY))

    assert abs(SSYdiff1 - SSYdiff3) < 0.00005


    pls2 = PLSCanonical(n_components = num_comp, scale = scale,
                        tol = tol, max_iter = miter, copy = True)
    pls2.fit(Xorig, Yorig)
    Yhat2 = pls2.predict(Xorig)

    SSYdiff2 = np.sum((Yorig-Yhat2)**2)
#    print "PLSCanonical : R2Yhat = %.4f" % (1 - (SSYdiff2 / SSY))

    # Compare PLSC and sklearn.PLSCanonical
    pls4 = PLSC(num_comp = num_comp, center = True, scale = scale,
                tolerance = tol, max_iter = miter)
    pls4.fit(X, Y)
    Yhat4 = pls4.predict(X)

    SSYdiff4 = np.sum((Yorig-Yhat4)**2)
#    print "PLSC         : R2Yhat = %.4f" % (1 - (SSYdiff4 / SSY))

    # Compare O2PLS and sklearn.PLSCanonical
    pls5 = O2PLS(num_comp = [num_comp, 1, 0], center = True, scale = scale,
                 tolerance = tol, max_iter = miter)
    pls5.fit(X, Y)
    Yhat5 = pls5.predict(X)

    SSYdiff5 = np.sum((Yorig-Yhat5)**2)
#    print "O2PLS        : R2Yhat = %.4f" % (1 - (SSYdiff5 / SSY))

    assert abs(SSYdiff2 - SSYdiff4) < 0.00005
    assert SSYdiff2 > SSYdiff5
Example #28
def test_update_database():
    sm = tm.Surrogate_Models()
    variables, objectives = datasets.load_linnerud(return_X_y=True)
    sm.random = 57757
    sm.update_database(np.ndarray.tolist(variables),
                       np.ndarray.tolist(objectives))
    ind_var_given = [[
        11,
        230,
        80,
    ], [
        6,
        70,
        31,
    ], [
        2,
        110,
        43,
    ], [
        14,
        215,
        105,
    ], [
        15,
        225,
        73,
    ], [
        4,
        60,
        25,
    ], [
        12,
        105,
        37,
    ], [
        12,
        101,
        101,
    ], [
        13,
        210,
        115,
    ], [
        13,
        155,
        58,
    ], [
        2,
        110,
        60,
    ], [
        15,
        200,
        40,
    ], [
        6,
        125,
        40,
    ], [
        8,
        101,
        38,
    ], [
        17,
        120,
        38,
    ]]
    obj_var_given = [[
        157,
        32,
        52,
    ], [
        193,
        36,
        46,
    ], [
        138,
        33,
        68,
    ], [
        154,
        34,
        64,
    ], [
        156,
        33,
        54,
    ], [
        176,
        37,
        54,
    ], [
        162,
        35,
        62,
    ], [
        193,
        38,
        58,
    ], [
        166,
        33,
        52,
    ], [
        189,
        35,
        46,
    ], [
        189,
        37,
        52,
    ], [
        176,
        31,
        74,
    ], [
        167,
        34,
        60,
    ], [
        211,
        38,
        56,
    ], [
        169,
        34,
        50,
    ]]
    np.testing.assert_array_equal(sm.var_train, ind_var_given)
    np.testing.assert_array_equal(sm.obj_train, obj_var_given)
    assert len(sm.var_test) == 5
    assert len(sm.obj_test) == 5

    sm.update_database([
        [
            12,
            250,
            85,
        ],
        [
            12,
            250,
            85,
        ],
    ], [[
        165,
        33,
        57,
    ], [
        165,
        33,
        57,
    ]])
    assert len(sm.var_train) == 16
    assert len(sm.obj_train) == 16
    assert len(sm.var_test) == 6
    assert len(sm.obj_test) == 6
Example #29
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse

x_train, x_test, y_train, y_test = train_test_split(X, y)

reg1 = DecisionTreeRegressor()
reg1.fit(x_train, y_train)
mean_absolute_error_tree = mae(reg1.predict(x_test), y_test)
mean_squared_error_tree = mse(reg1.predict(x_test), y_test)
print "Decision Tree mean absolute error: {:.2f}".format(
    mean_absolute_error_tree)
print "Decision Tree mean absolute error: {:.2f}".format(
    mean_squared_error_tree)

reg2 = LinearRegression()
reg2.fit(x_train, y_train)
mean_absolute_error_linear = mae(reg2.predict(x_test), y_test)
mean_squared_error_linear = mse(reg2.predict(x_test), y_test)
Example #30
    
    x_data = np.zeros([20,2])
    x_data[:,0] = 1
    x_data[:,1] = x
    y_data = y
    
    y_data = np.expand_dims(y_data,axis=1)
    
    # compute the weights
    W = np.dot(np.dot(inv((np.dot(x_data.T,x_data))),x_data.T),y_data)
        
    return W

# -- Get data
    
data_set = load_linnerud()

raw_data = data_set.data # Chins, Situps, Jumps
features_names = data_set.feature_names

target_data = data_set.target  # Weight, Waist, Pulse
target_names = data_set.target_names

fig, axis  = plt.subplots(3, 3)
fig.set_size_inches(20,25)

for i in range(len(target_names)):
    x_temp = target_data[:,i] 
    
    
    for j in range(len(features_names)):
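The weight computation in this excerpt is the ordinary least-squares normal equation, W = (X^T X)^{-1} X^T y. A self-contained sketch of the same idea for one feature/target pairing (np.linalg.lstsq is the numerically safer alternative to the explicit inverse):

import numpy as np
from sklearn.datasets import load_linnerud

d = load_linnerud()
x_data = np.column_stack([np.ones(d.data.shape[0]), d.data[:, 0]])  # bias + Chins
y_data = d.target[:, 0]                                             # Weight
W, *_ = np.linalg.lstsq(x_data, y_data, rcond=None)
print(W)   # intercept and slope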
Example #31
def test_linnerud_data():
    X, y = load_linnerud(return_X_y=True)
    assert apply_toy_on(X, y) > -3000
Example #32
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from mpl_toolkits.mplot3d import Axes3D


lin = datasets.load_linnerud()

lin_features = lin.feature_names

X = lin.data[:, np.newaxis, 2]
X_train = X[:-15]
X_test = X[-15:]
y_train = lin.target[:-15]
y_test = lin.target[-15:]

regression = linear_model.LinearRegression()

# training
regression.fit(X_train, y_train)

# prediction
y_pred = regression.predict(X_test)

print('Coefficients: \n', regression.coef_)

print("Mean squared error: \n %.2f" % mean_squared_error(y_test, y_pred))

print('Variance score: \n %.2f' % r2_score(y_test, y_pred))
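A note on the slicing above: lin.data[:, np.newaxis, 2] keeps X two-dimensional with shape (20, 1), which scikit-learn estimators require even for a single feature. An equivalent sketch:

X = lin.data[:, 2].reshape(-1, 1)   # also shape (20, 1)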
Example #33
#exec(open('.\\trees\\sklearn\\datasets.py').read())
import subprocess as sp
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_wine
from sklearn.datasets import load_breast_cancer

if __name__ == '__main__':
    sp.call('cls', shell=True)

    # load the toy datasets
    ds = dict()
    ds['iris'] = load_iris()
    ds['boston'] = load_boston()
    ds['diabetes'] = load_diabetes()
    ds['digits'] = load_digits()
    ds['linnerud'] = load_linnerud()
    ds['wine'] = load_wine()
    ds['breastcancer'] = load_breast_cancer()

    # print the keys of every dataset
    for key in ds:
        print(key)
        for key2 in ds[key]:
            print('{0}{1}'.format('    ', key2))
Example #34
import numpy as np
from numpy.testing import assert_array_almost_equal
from sklearn.datasets import load_linnerud
from sklearn import pls

d = load_linnerud()
X = d.data
Y = d.target


def test_pls():
    n_components = 2
    # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
    # ===========================================================
    # Compare 2 algo.: nipals vs. svd
    # ------------------------------
    pls_bynipals = pls.PLSCanonical(n_components=n_components)
    pls_bynipals.fit(X, Y)
    pls_bysvd = pls.PLSCanonical(algorithm="svd", n_components=n_components)
    pls_bysvd.fit(X, Y)
    # check that the loading vectors are highly correlated
    assert_array_almost_equal(
        [
            np.abs(np.corrcoef(pls_bynipals.x_loadings_[:, k], pls_bysvd.x_loadings_[:, k])[1, 0])
            for k in range(n_components)
        ],
        np.ones(n_components),
        err_msg="nipals and svd implementation lead to different x loadings",
    )

    assert_array_almost_equal(
Example #35
def load_multi_data(self):
    x, y = datasets.load_linnerud(return_X_y=True)
    self.__inputs = x.tolist()
    self.__outputs = y.tolist()
Example #36
#solution_dsci_chapter_02_diabetes.py

"""Data from scikit-learn's traing data, and are based on 
   clinical data available here:   
   http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html
   Documentation for this
   and lots of other machine learning sets can be found here:
   http://scikit-learn.org/stable/datasets/index.html
"""   

import sklearn.datasets as ds
import pandas as pd
#everything=ds.load_diabetes()
everything=ds.load_linnerud()

#build DataFrame objects with contents of the data set
eser=pd.DataFrame(everything['data'], columns=everything['feature_names'])
pser=pd.DataFrame(everything['target'], columns=everything['target_names'])
               
#combine the two DataFrames a column at a time
for c in pser.columns:
    eser[c] = pser[c]   # DataFrame.assign(c=...) would create a literal column named 'c'

#create an "exercise index" (assign returns a new DataFrame, so keep the result)
eser = eser.assign(eindex=eser['Chins'] * 3 +
                          eser["Situps"] * 2 +
                          eser["Jumps"])

#convert the Weight to kilos (2.2 lb = 1 kg)
eser['Weight']=eser['Weight']/2.2
Example #38
# This tutorial is more about understanding datasets in scikit

# Toy Datasets
# By default scikit comes with preloaded datasets for practicing machine learning algorithms.
from sklearn import datasets
iris_data = datasets.load_iris()  # Classification
wine_data = datasets.load_wine()  # Classification
cancer_data = datasets.load_breast_cancer()  # Classification
diabetes_data = datasets.load_diabetes()  # Regression
boston_data = datasets.load_boston()  # Regression
linnerud_data = datasets.load_linnerud()  # Multivariate Regression

# Accessing data
# By default, datasets return a Bunch - a dictionary-like object.
# A Bunch holds the target variable (Y) and the feature variables (X).
X = iris_data.get("data")
Y = iris_data.get("target")
feature_names = iris_data.get("feature_names")
target_names = iris_data.get("target_names")  # nothing but the label names

X = diabetes_data.get("data")
Y = diabetes_data.get("target")
feature_names = diabetes_data.get("feature_names")
# for regression problems there is no target_names

# for more details about dataset visit
# http://scikit-learn.org/stable/datasets/index.html#datasets
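Complementing the Bunch access above, newer releases can hand back pandas objects directly (an assumption: scikit-learn >= 0.23, where the as_frame option was introduced):

from sklearn.datasets import load_linnerud

bunch = load_linnerud(as_frame=True)
print(bunch.frame.head())   # features and targets in a single DataFrame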
Example #39
# coding=utf-8

from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

d1 = datasets.load_iris()  # iris data
d2 = datasets.load_breast_cancer()  # breast cancer data
d3 = datasets.load_digits()  # handwritten digits
d4 = datasets.load_boston()  # Boston housing prices
d5 = datasets.load_linnerud()  # physical fitness dataset


print(d3.keys())
samples,features = d3.data.shape
print(samples,features)
print(d3.images.shape)

#print(d1.data)
#print(d1.target)
print(d3.target_names)

print(np.bincount(d1.target))

x_index = 3
colors = ['blue','red','green']

'''
for label,color in zip(range(len(d1.target_names)),colors):
    plt.hist(d1.data[d1.target==label,x_index],label=d1.target_names[label],color=color) # histogram
Example #40
def test_pls():
    d = load_linnerud()
    X = d.data
    Y = d.target
    # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
    # ===========================================================
    # Compare 2 algo.: nipals vs. svd
    # ------------------------------
    pls_bynipals = pls.PLSCanonical(n_components=X.shape[1])
    pls_bynipals.fit(X, Y)
    pls_bysvd = pls.PLSCanonical(algorithm="svd", n_components=X.shape[1])
    pls_bysvd.fit(X, Y)
    # check equalities of loading (up to the sign of the second column)
    assert_array_almost_equal(
        pls_bynipals.x_loadings_,
        np.multiply(pls_bysvd.x_loadings_, np.array([1, -1, 1])), decimal=5,
        err_msg="nipals and svd implementation lead to different x loadings")

    assert_array_almost_equal(
        pls_bynipals.y_loadings_,
        np.multiply(pls_bysvd.y_loadings_, np.array([1, -1, 1])), decimal=5,
        err_msg="nipals and svd implementation lead to different y loadings")

    # Check PLS properties (with n_components=X.shape[1])
    # ---------------------------------------------------
    plsca = pls.PLSCanonical(n_components=X.shape[1])
    plsca.fit(X, Y)
    T = plsca.x_scores_
    P = plsca.x_loadings_
    Wx = plsca.x_weights_
    U = plsca.y_scores_
    Q = plsca.y_loadings_
    Wy = plsca.y_weights_

    def check_ortho(M, err_msg):
        K = np.dot(M.T, M)
        assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(Wx, "x weights are not orthogonal")
    check_ortho(Wy, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(T, "x scores are not orthogonal")
    check_ortho(U, "y scores are not orthogonal")

    # Check X = TP' and Y = UQ' (with (p == q) components)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # center scale X, Y
    Xc, Yc, x_mean, y_mean, x_std, y_std =\
        pls._center_scale_xy(X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'")
    assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'")

    # Check that rotations on training data lead to scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xr = plsca.transform(X)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    Xr, Yr = plsca.transform(X, Y)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    assert_array_almost_equal(Yr, plsca.y_scores_,
                              err_msg="rotation on Y failed")

    # "Non regression test" on canonical PLS
    # --------------------------------------
    # The results were checked against the R-package plspm
    pls_ca = pls.PLSCanonical(n_components=X.shape[1])
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704,  0.25616119, -0.74715187],
         [-0.74697144,  0.11930791,  0.65406368],
         [-0.25668686, -0.95924297, -0.11817271]])
    assert_array_almost_equal(pls_ca.x_weights_, x_weights)

    x_rotations = np.array(
        [[-0.61330704,  0.41591889, -0.62297525],
         [-0.74697144,  0.31388326,  0.77368233],
         [-0.25668686, -0.89237972, -0.24121788]])
    assert_array_almost_equal(pls_ca.x_rotations_, x_rotations)

    y_weights = np.array(
        [[+0.58989127,  0.7890047,   0.1717553],
         [+0.77134053, -0.61351791,  0.16920272],
         [-0.23887670, -0.03267062,  0.97050016]])
    assert_array_almost_equal(pls_ca.y_weights_, y_weights)

    y_rotations = np.array(
        [[+0.58989127,  0.7168115,  0.30665872],
         [+0.77134053, -0.70791757,  0.19786539],
         [-0.23887670, -0.00343595,  0.94162826]])
    assert_array_almost_equal(pls_ca.y_rotations_, y_rotations)

    # 2) Regression PLS (PLS2): "Non regression test"
    # ===============================================
    # The results were checked against the R-packages plspm, mixOmics and pls
    pls_2 = pls.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, -0.00443647,  0.78983213],
         [-0.74697144, -0.32172099, -0.58183269],
         [-0.25668686,  0.94682413, -0.19399983]])
    assert_array_almost_equal(pls_2.x_weights_, x_weights)

    x_loadings = np.array(
        [[-0.61470416, -0.24574278,  0.78983213],
         [-0.65625755, -0.14396183, -0.58183269],
         [-0.51733059,  1.00609417, -0.19399983]])
    assert_array_almost_equal(pls_2.x_loadings_, x_loadings)

    y_weights = np.array(
        [[+0.32456184,  0.29892183,  0.20316322],
         [+0.42439636,  0.61970543,  0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    assert_array_almost_equal(pls_2.y_weights_, y_weights)

    y_loadings = np.array(
        [[+0.32456184,  0.29892183,  0.20316322],
         [+0.42439636,  0.61970543,  0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    assert_array_almost_equal(pls_2.y_loadings_, y_loadings)

    # 3) Another non-regression test of Canonical PLS on random dataset
    # =================================================================
    # The results were checked against the R-package plspm
    n = 500
    p_noise = 10
    q_noise = 5
    # 2 latents vars:
    np.random.seed(11)
    l1 = np.random.normal(size=n)
    l2 = np.random.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    X = latents + np.random.normal(size=4 * n).reshape((n, 4))
    Y = latents + np.random.normal(size=4 * n).reshape((n, 4))
    X = np.concatenate(
        (X, np.random.normal(size=p_noise * n).reshape(n, p_noise)), axis=1)
    Y = np.concatenate(
        (Y, np.random.normal(size=q_noise * n).reshape(n, q_noise)), axis=1)
    np.random.seed(None)
    pls_ca = pls.PLSCanonical(n_components=3)
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[0.65803719,  0.19197924,  0.21769083],
         [0.7009113,  0.13303969, -0.15376699],
         [0.13528197, -0.68636408,  0.13856546],
         [0.16854574, -0.66788088, -0.12485304],
         [-0.03232333, -0.04189855,  0.40690153],
         [0.1148816, -0.09643158,  0.1613305],
         [0.04792138, -0.02384992,  0.17175319],
         [-0.06781, -0.01666137, -0.18556747],
         [-0.00266945, -0.00160224,  0.11893098],
         [-0.00849528, -0.07706095,  0.1570547],
         [-0.00949471, -0.02964127,  0.34657036],
         [-0.03572177,  0.0945091,  0.3414855],
         [0.05584937, -0.02028961, -0.57682568],
         [0.05744254, -0.01482333, -0.17431274]])
    assert_array_almost_equal(pls_ca.x_weights_, x_weights)

    x_loadings = np.array(
        [[0.65649254,  0.1847647,  0.15270699],
         [0.67554234,  0.15237508, -0.09182247],
         [0.19219925, -0.67750975,  0.08673128],
         [0.2133631, -0.67034809, -0.08835483],
         [-0.03178912, -0.06668336,  0.43395268],
         [0.15684588, -0.13350241,  0.20578984],
         [0.03337736, -0.03807306,  0.09871553],
         [-0.06199844,  0.01559854, -0.1881785],
         [0.00406146, -0.00587025,  0.16413253],
         [-0.00374239, -0.05848466,  0.19140336],
         [0.00139214, -0.01033161,  0.32239136],
         [-0.05292828,  0.0953533,  0.31916881],
         [0.04031924, -0.01961045, -0.65174036],
         [0.06172484, -0.06597366, -0.1244497]])
    assert_array_almost_equal(pls_ca.x_loadings_, x_loadings)

    y_weights = np.array(
        [[0.66101097,  0.18672553,  0.22826092],
         [0.69347861,  0.18463471, -0.23995597],
         [0.14462724, -0.66504085,  0.17082434],
         [0.22247955, -0.6932605, -0.09832993],
         [0.07035859,  0.00714283,  0.67810124],
         [0.07765351, -0.0105204, -0.44108074],
         [-0.00917056,  0.04322147,  0.10062478],
         [-0.01909512,  0.06182718,  0.28830475],
         [0.01756709,  0.04797666,  0.32225745]])
    assert_array_almost_equal(pls_ca.y_weights_, y_weights)

    y_loadings = np.array(
        [[0.68568625,  0.1674376,  0.0969508],
         [0.68782064,  0.20375837, -0.1164448],
         [0.11712173, -0.68046903,  0.12001505],
         [0.17860457, -0.6798319, -0.05089681],
         [0.06265739, -0.0277703,  0.74729584],
         [0.0914178,  0.00403751, -0.5135078],
         [-0.02196918, -0.01377169,  0.09564505],
         [-0.03288952,  0.09039729,  0.31858973],
         [0.04287624,  0.05254676,  0.27836841]])
    assert_array_almost_equal(pls_ca.y_loadings_, y_loadings)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_weights_, "x weights are not orthogonal")
    check_ortho(pls_ca.y_weights_, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_scores_, "x scores are not orthogonal")
    check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")
Example #41
def test_pls():
    d = load_linnerud()
    X = d.data
    Y = d.target
    # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
    # ===========================================================
    # Compare 2 algo.: nipals vs. svd
    # ------------------------------
    pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1])
    pls_bynipals.fit(X, Y)
    pls_bysvd = pls_.PLSCanonical(algorithm="svd", n_components=X.shape[1])
    pls_bysvd.fit(X, Y)
    # check equalities of loading (up to the sign of the second column)
    assert_array_almost_equal(
        pls_bynipals.x_loadings_,
        pls_bysvd.x_loadings_, decimal=5,
        err_msg="nipals and svd implementations lead to different x loadings")

    assert_array_almost_equal(
        pls_bynipals.y_loadings_,
        pls_bysvd.y_loadings_, decimal=5,
        err_msg="nipals and svd implementations lead to different y loadings")

    # Check PLS properties (with n_components=X.shape[1])
    # ---------------------------------------------------
    plsca = pls_.PLSCanonical(n_components=X.shape[1])
    plsca.fit(X, Y)
    T = plsca.x_scores_
    P = plsca.x_loadings_
    Wx = plsca.x_weights_
    U = plsca.y_scores_
    Q = plsca.y_loadings_
    Wy = plsca.y_weights_

    def check_ortho(M, err_msg):
        K = np.dot(M.T, M)
        assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(Wx, "x weights are not orthogonal")
    check_ortho(Wy, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(T, "x scores are not orthogonal")
    check_ortho(U, "y scores are not orthogonal")

    # Check X = TP' and Y = UQ' (with (p == q) components)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # center scale X, Y
    Xc, Yc, x_mean, y_mean, x_std, y_std =\
        pls_._center_scale_xy(X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'")
    assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'")

    # Check that rotations on training data lead to scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xr = plsca.transform(X)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    Xr, Yr = plsca.transform(X, Y)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    assert_array_almost_equal(Yr, plsca.y_scores_,
                              err_msg="rotation on Y failed")

    # "Non regression test" on canonical PLS
    # --------------------------------------
    # The results were checked against the R-package plspm
    pls_ca = pls_.PLSCanonical(n_components=X.shape[1])
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704,  0.25616119, -0.74715187],
         [-0.74697144,  0.11930791,  0.65406368],
         [-0.25668686, -0.95924297, -0.11817271]])
    # x_weights_sign_flip holds columns of 1 or -1, depending on sign flip
    # between R and python
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_rotations = np.array(
        [[-0.61330704,  0.41591889, -0.62297525],
         [-0.74697144,  0.31388326,  0.77368233],
         [-0.25668686, -0.89237972, -0.24121788]])
    x_rotations_sign_flip = pls_ca.x_rotations_ / x_rotations

    y_weights = np.array(
        [[+0.58989127,  0.7890047,   0.1717553],
         [+0.77134053, -0.61351791,  0.16920272],
         [-0.23887670, -0.03267062,  0.97050016]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_rotations = np.array(
        [[+0.58989127,  0.7168115,  0.30665872],
         [+0.77134053, -0.70791757,  0.19786539],
         [-0.23887670, -0.00343595,  0.94162826]])
    y_rotations_sign_flip = pls_ca.y_rotations_ / y_rotations

    # x_weights = X.dot(x_rotation)
    # Hence R/python sign flip should be the same in x_weight and x_rotation
    assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip)
    # This tests that R / python give the same result up to column
    # sign indeterminacy
    assert_array_almost_equal(np.abs(x_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip)
    assert_array_almost_equal(np.abs(y_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 2) Regression PLS (PLS2): "Non regression test"
    # ===============================================
    # The results were checked against the R packages plspm, mixOmics and pls
    pls_2 = pls_.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, -0.00443647,  0.78983213],
         [-0.74697144, -0.32172099, -0.58183269],
         [-0.25668686,  0.94682413, -0.19399983]])
    x_weights_sign_flip = pls_2.x_weights_ / x_weights

    x_loadings = np.array(
        [[-0.61470416, -0.24574278,  0.78983213],
         [-0.65625755, -0.14396183, -0.58183269],
         [-0.51733059,  1.00609417, -0.19399983]])
    x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings

    y_weights = np.array(
        [[+0.32456184,  0.29892183,  0.20316322],
         [+0.42439636,  0.61970543,  0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    y_weights_sign_flip = pls_2.y_weights_ / y_weights

    y_loadings = np.array(
        [[+0.32456184,  0.29892183,  0.20316322],
         [+0.42439636,  0.61970543,  0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    y_loadings_sign_flip = pls_2.y_loadings_ / y_loadings

    # x_loadings[:, i] = Xi.dot(x_weights[:, i]) \forall i
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 3) Another non-regression test of Canonical PLS on random dataset
    # =================================================================
    # The results were checked against the R-package plspm
    n = 500
    p_noise = 10
    q_noise = 5
    # 2 latent variables:
    rng = check_random_state(11)
    l1 = rng.normal(size=n)
    l2 = rng.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    X = latents + rng.normal(size=4 * n).reshape((n, 4))
    Y = latents + rng.normal(size=4 * n).reshape((n, 4))
    X = np.concatenate(
        (X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1)
    Y = np.concatenate(
        (Y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1)

    pls_ca = pls_.PLSCanonical(n_components=3)
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[0.65803719,  0.19197924,  0.21769083],
         [0.7009113,  0.13303969, -0.15376699],
         [0.13528197, -0.68636408,  0.13856546],
         [0.16854574, -0.66788088, -0.12485304],
         [-0.03232333, -0.04189855,  0.40690153],
         [0.1148816, -0.09643158,  0.1613305],
         [0.04792138, -0.02384992,  0.17175319],
         [-0.06781, -0.01666137, -0.18556747],
         [-0.00266945, -0.00160224,  0.11893098],
         [-0.00849528, -0.07706095,  0.1570547],
         [-0.00949471, -0.02964127,  0.34657036],
         [-0.03572177,  0.0945091,  0.3414855],
         [0.05584937, -0.02028961, -0.57682568],
         [0.05744254, -0.01482333, -0.17431274]])
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_loadings = np.array(
        [[0.65649254,  0.1847647,  0.15270699],
         [0.67554234,  0.15237508, -0.09182247],
         [0.19219925, -0.67750975,  0.08673128],
         [0.2133631, -0.67034809, -0.08835483],
         [-0.03178912, -0.06668336,  0.43395268],
         [0.15684588, -0.13350241,  0.20578984],
         [0.03337736, -0.03807306,  0.09871553],
         [-0.06199844,  0.01559854, -0.1881785],
         [0.00406146, -0.00587025,  0.16413253],
         [-0.00374239, -0.05848466,  0.19140336],
         [0.00139214, -0.01033161,  0.32239136],
         [-0.05292828,  0.0953533,  0.31916881],
         [0.04031924, -0.01961045, -0.65174036],
         [0.06172484, -0.06597366, -0.1244497]])
    x_loadings_sign_flip = pls_ca.x_loadings_ / x_loadings

    y_weights = np.array(
        [[0.66101097,  0.18672553,  0.22826092],
         [0.69347861,  0.18463471, -0.23995597],
         [0.14462724, -0.66504085,  0.17082434],
         [0.22247955, -0.6932605, -0.09832993],
         [0.07035859,  0.00714283,  0.67810124],
         [0.07765351, -0.0105204, -0.44108074],
         [-0.00917056,  0.04322147,  0.10062478],
         [-0.01909512,  0.06182718,  0.28830475],
         [0.01756709,  0.04797666,  0.32225745]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_loadings = np.array(
        [[0.68568625,  0.1674376,  0.0969508],
         [0.68782064,  0.20375837, -0.1164448],
         [0.11712173, -0.68046903,  0.12001505],
         [0.17860457, -0.6798319, -0.05089681],
         [0.06265739, -0.0277703,  0.74729584],
         [0.0914178,  0.00403751, -0.5135078],
         [-0.02196918, -0.01377169,  0.09564505],
         [-0.03288952,  0.09039729,  0.31858973],
         [0.04287624,  0.05254676,  0.27836841]])
    y_loadings_sign_flip = pls_ca.y_loadings_ / y_loadings

    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_weights_, "x weights are not orthogonal")
    check_ortho(pls_ca.y_weights_, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_scores_, "x scores are not orthogonal")
    check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")
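The sign-flip assertions above compare weight and rotation matrices only up to the sign of each column, since PLS directions are determined only up to sign. A minimal standalone sketch of that idiom, assuming only numpy (the helper name is illustrative, not part of the test suite):

import numpy as np

def assert_equal_up_to_column_sign(A, B, decimal=4):
    # If A and B agree up to whole-column sign flips, the elementwise
    # ratio has magnitude 1 everywhere...
    ratio = A / B
    np.testing.assert_array_almost_equal(np.abs(ratio), 1, decimal)
    # ...and the sign is constant within each column.
    np.testing.assert_array_almost_equal(
        ratio, ratio[:1, :].repeat(A.shape[0], axis=0), decimal)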
Example No. 42
0
def test_load_linnerud():
    res = load_linnerud()
    assert_equal(res.data.shape, (20, 3))
    assert_equal(res.target.shape, (20, 3))
    assert_equal(len(res.target_names), 3)
    assert_true(res.DESCR)
Example No. 43
0
def scikitAlgorithms_UCIDataset(input_dict):
    from sklearn import datasets
    allDSets = {"iris": datasets.load_iris(),
                "boston": datasets.load_boston(),
                "diabetes": datasets.load_diabetes(),
                "linnerud": datasets.load_linnerud()}
    dataset = allDSets[input_dict['dsIn']]
    output_dict = {}
    output_dict['dtsOut'] = dataset  # (dataset.data, dataset.target)
    return output_dict
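For illustration, a hypothetical call to this dispatcher (the input key and dataset name are as defined above):

result = scikitAlgorithms_UCIDataset({'dsIn': 'linnerud'})
print(result['dtsOut'].data.shape)  # (20, 3)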
Example No. 44
0
    #     y_test = _make_matrix_classes(y_test)

    #     rna.fit(X_train, y_train)
    #     y_pred = rna.predict(X_test)
    #     s = rna.score(X_test, y_test)
    #     print("score: {}".format(s))

    with MLP(
            mse_target=1e-8,
            max_epochs=1e8,
            hidden_layer_sizes=(6, 30, 30, 6),
            output_fn=tf.nn.sigmoid,
            verbose=True,
            learning_rate=1e-6,
    ) as rna:
        linnerud = load_linnerud()

        ss = ShuffleSplit()
        train_index, test_index = next(iter(ss.split(linnerud.data)))

        X_train = linnerud.data[train_index]
        y_train = linnerud.target[train_index]
        X_test = linnerud.data[test_index]
        y_test = linnerud.target[test_index]

        X_train = np.array(X_train)
        y_train = np.array(y_train)
        X_test = np.array(X_test)
        y_test = np.array(y_test)

        norm_x = MinMaxScaler(feature_range=(0.0, 1.0), copy=True)
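The listing breaks off after constructing the scaler. A hedged continuation, using the standard scikit-learn MinMaxScaler API and the fit call already hinted at in the commented-out lines above (a sketch, not the truncated original):

        # scale features to [0, 1], fitting the scaler on training data only
        X_train = norm_x.fit_transform(X_train)
        X_test = norm_x.transform(X_test)
        rna.fit(X_train, y_train)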
Example No. 45
0
# load_breast_cancer() executed
iris = datasets.load_breast_cancer()
print("the output of load_breast_cancer() :: ", iris)

# load_diabetes() executed
iris = datasets.load_diabetes()
print("the output of load_diabetes() :: ", iris)

# load_digits() executed
iris = datasets.load_digits()
print("the output of load_digits() :: ", iris)

# load_iris() executed
print("the output of load_iris() :: ", datasets.load_iris())

# load_linnerud() executed
print("the output of load_linnerud() :: ", datasets.load_linnerud())

# load_wine() executed
print("the output of load_wine() :: ", datasets.load_wine())

# make_blobs() executed
print("the output of make_blobs() :: ", datasets.make_blobs())

# make_circles() executed
print("the output of make_circles() :: ", datasets.make_circles())

# make_classification() executed
print("the output of make_classification() :: ",
      datasets.make_classification())

# make_friedman1() executed
print("the output of make_friedman1() :: ", datasets.make_friedman1())
Example No. 46
0
def test_update_database_dict():
    sm = tm.Surrogate_Models()
    variables, objectives = datasets.load_linnerud(return_X_y=True)
    sm.random = 57757
    test_dict = {}

    i = 0
    for ind, obj in zip(variables, objectives):
        test_dict['core {}'.format(i)] = {
            'independent variables': {},
            'dependent variables': {}
        }
        test_dict['core {}'.format(i)]['independent variables'] = {
            'a': ind[0],
            'b': ind[1],
            'c': ind[2]
        }
        test_dict['core {}'.format(i)]['dependent variables'] = {
            'd': obj[0],
            'e': obj[1],
            'f': obj[2]
        }
        i += 1

    sm.update_database(['a', 'b', 'c'], ['d', 'e', 'f'], database=test_dict)
    ind_var_given = [[11, 230, 80], [6, 70, 31], [2, 110, 43],
                     [14, 215, 105], [15, 225, 73], [4, 60, 25],
                     [12, 105, 37], [12, 101, 101], [13, 210, 115],
                     [13, 155, 58], [2, 110, 60], [15, 200, 40],
                     [6, 125, 40], [8, 101, 38], [17, 120, 38]]
    obj_var_given = [[157, 32, 52], [193, 36, 46], [138, 33, 68],
                     [154, 34, 64], [156, 33, 54], [176, 37, 54],
                     [162, 35, 62], [193, 38, 58], [166, 33, 52],
                     [189, 35, 46], [189, 37, 52], [176, 31, 74],
                     [167, 34, 60], [211, 38, 56], [169, 34, 50]]
    np.testing.assert_array_equal(sm.var_train, ind_var_given)
    np.testing.assert_array_equal(sm.obj_train, obj_var_given)
    assert len(sm.var_test) == 5
    assert len(sm.obj_test) == 5

    test_dict = {
        'core 100': {
            'independent variables': {
                'a': 12,
                'b': 250,
                'c': 85
            },
            'dependent variables': {
                'd': 165,
                'e': 33,
                'f': 57
            }
        },
        'core 101': {
            'independent variables': {
                'a': 12,
                'b': 250,
                'c': 85
            },
            'dependent variables': {
                'd': 165,
                'e': 33,
                'f': 57
            }
        }
    }
    sm.update_database(['a', 'b', 'c'], ['d', 'e', 'f'], database=test_dict)
    assert len(sm.var_train) == 16
    assert len(sm.obj_train) == 16
    assert len(sm.var_test) == 6
    assert len(sm.obj_test) == 6
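The asserted sizes follow from Linnerud's 20 samples: the update evidently holds out 25%, giving 15 training and 5 test rows, and appending the two extra cores makes 22 rows split 16/6. A quick shape check, assuming only sklearn:

from sklearn.datasets import load_linnerud

X, y = load_linnerud(return_X_y=True)
assert X.shape == (20, 3) and y.shape == (20, 3)  # 20 rows -> 15/5 at 75/25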
Example No. 48
0
def main():
    start_t = datetime.now()
    # set built-in dataset choices
    ds1 = datasets.load_boston()
    ds2 = datasets.load_iris()
    ds3 = datasets.load_diabetes()
    ds4 = datasets.load_digits()
    ds5 = datasets.load_linnerud()
    ds6 = datasets.load_wine()
    ds7 = datasets.load_breast_cancer()

    # Get user inputs -- add error handling
    # when response is continuous then no 'target_names'
    # when response is categorical set 'target_names'
    print(""" Select SKLearn dataset to use: ~default is 3~
                    1 - boston (response is continuous)
                    2 - iris (response is catagorical [3 cats])
                    3 - diabetes (response is continuous)
                    4 - digits (response is an array)
                    5 - linnerud (response is catagorical array of continuous)[3 cats])
                    6 - wine (response is catagorical [3 cast])
                    7 - breast cancer (response is catagorical {2cats/bool])
                    8 - user choice
                  """)

    choice_str = input("Which set? ").strip()
    choice = int(choice_str) if choice_str else 3  # default to 3 on empty input

    if choice == 1:
        the_ds = ds1
    elif choice == 2:
        the_ds = ds2
    elif choice == 3:
        the_ds = ds3
    elif choice == 4:
        the_ds = ds4
    elif choice == 5:
        the_ds = ds5
    elif choice == 6:
        the_ds = ds6
    elif choice == 7:
        the_ds = ds7
    elif choice == 8:
        the_ds = input("direct path to dataset: ")
    else:
        print(f"{choice} is an invalid entry. Please try again.")

    print(f"You chose {choice}")
    # turn source data into a pandas df
    working_df = pd.DataFrame(the_ds.data, columns=the_ds.feature_names)
    working_df["target"] = pd.Series(the_ds.target)  # chg
    working_df.head()

    # get column names

    col_list = working_df.columns.values.tolist()
    titles = [
        i.replace(")", "_").replace("(", "").replace(" ", "_")
        for i in col_list
    ]
    working_df.columns = titles
    print_heading("Original Dataset")
    print(working_df)

    # ~RESPONSE~
    # y = the_ds.target  # assign the response here #chg
    res_type = tot(the_ds.target)
    print_heading("Response type is " + res_type)

    # Group Predictor types
    type_mask = [tot(working_df[i]) for i in col_list]
    predictor_array = np.column_stack((col_list, type_mask))
    pred_df = pd.DataFrame(predictor_array, columns=["Predictor", "Category"])

    # ~PREDICTORS~
    # Allow user to select the desired features
    # feature_list = predictor_select(the_ds)
    # enable the above line to select only specific features to evaluate

    cont_feature_df = pred_df[
        pred_df["Category"] ==
        "continuous"]  # where tot continuous and bool_check false
    try:
        # an in-place drop assigned back would leave None; filter out the
        # response row instead
        cont_feature_df = cont_feature_df[cont_feature_df["Predictor"] != "target"]
    except Exception:
        pass

    cat_feature_df = pred_df[
        pred_df["Category"] !=
        "continuous"]  # where tot not continuous or bool_check true
    try:
        cat_feature_df = cat_feature_df[cat_feature_df["Predictor"] != "target"]
    except Exception:
        pass

    print("No Continuous!") if cont_feature_df.empty else print(
        cont_feature_df)
    print("No Categorical!") if cat_feature_df.empty else print(cat_feature_df)

    # cont_feature_list = list(cont_feature_df["Predictor"])
    # cat_feature_list = list(cat_feature_df["Predictor"])

    # Make plots
    # if res_type == "continuous":
    cat_con_file_list = plot_cat_cont(the_ds)
    con_con_file_list = plot_cont_cont(the_ds)

    # else:
    con_cat_file_list = plot_cont_cat(the_ds)
    cat_cat_file_list = plot_cat_cat(the_ds)

    # Generate Report DF
    report_col = (
        "Category",
        "p-val_&_t-val",
        "Regression",
        "Logistic_Regression",
        "Difference_with_mean",
        "Random_Forest",
    )
    report_df = pd.DataFrame("", index=col_list, columns=report_col)
    report_df = report_df.drop(["target"])
    pred_df = pred_df.set_index(["Predictor"])
    pred_df = pred_df.drop(["target"])

    # Update Report with data
    report_df.index.name = "Predictor"
    pred_df.index = report_df.index

    # add plots to report
    list_to_df(pred_df, report_df, "Category")
    temp_df = pd.DataFrame(cat_con_file_list, columns=["Logistic_Regression"])
    list_to_df(temp_df, report_df, "Logistic_Regression")
    temp_df = pd.DataFrame(con_con_file_list, columns=["Regression"])
    list_to_df(temp_df, report_df, "Regression")
    temp_df = pd.DataFrame(con_cat_file_list, columns=["Random_Forest"])
    list_to_df(temp_df, report_df, "Random_Forest")
    temp_df = pd.DataFrame(cat_cat_file_list, columns=["Difference_with_mean"])
    list_to_df(temp_df, report_df, "Difference_with_mean")

    # Save report to HTML
    report_df.to_html("HW_report_" +
                      datetime.now().strftime("%Y_%m_%d-%H_%M") + ".html")
    print(datetime.now() - start_t)
    return
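The tot helper used above, which labels a response or predictor as continuous or categorical, is not shown in this listing. A hypothetical stand-in, for illustration only:

import numpy as np
import pandas as pd

def tot(values):
    # Hypothetical re-creation: treat boolean, object-typed, or
    # low-cardinality data as categorical, everything else as continuous.
    s = pd.Series(np.asarray(values).ravel())
    if s.dtype == object or s.dtype == bool or s.nunique() <= 10:
        return "categorical"
    return "continuous"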