def test_load_linnerud():
    """Check the Linnerud bunch shapes and the ``return_X_y`` option."""
    dataset = load_linnerud()
    for array in (dataset.data, dataset.target):
        assert_equal(array.shape, (20, 3))
    assert_equal(len(dataset.target_names), 3)
    assert_true(dataset.DESCR)

    # return_X_y=True must give a plain tuple that mirrors a freshly
    # loaded bunch's data and target.
    pair = load_linnerud(return_X_y=True)
    reference = load_linnerud()
    assert_true(isinstance(pair, tuple))
    assert_array_equal(pair[0], reference.data)
    assert_array_equal(pair[1], reference.target)
def test_pls_errors():
    """Each estimator must reject ``n_components = 4`` with a ``ValueError``."""
    linnerud = load_linnerud()
    X, Y = linnerud.data, linnerud.target
    estimators = [pls_.PLSCanonical(), pls_.PLSRegression(), pls_.PLSSVD()]
    for estimator in estimators:
        estimator.n_components = 4
        assert_raise_message(
            ValueError, "Invalid number of components", estimator.fit, X, Y)
def test_convergence_fail():
    """A fit capped at ``max_iter=2`` with ``tol=1e-10`` must warn."""
    linnerud = load_linnerud()
    X, Y = linnerud.data, linnerud.target
    n_features = X.shape[1]
    starved = pls_.PLSCanonical(
        n_components=n_features, max_iter=2, tol=1e-10)
    assert_warns(ConvergenceWarning, starved.fit, X, Y)
def test_eigsym():
    """Compare ``EIGSym`` with ``np.linalg.eig`` on the matrix ``X.T X``.

    Checks that ``V D V.T`` reconstructs the matrix and that the
    eigenvectors agree with NumPy's once both sets have been passed
    through ``direct``.
    """
    features = load_linnerud().data
    n_comp = 3
    gram = dot(features.T, features)

    solver = EIGSym(num_comp=n_comp, tolerance=5e-12)
    solver.fit(gram)
    rebuilt = dot(solver.V, dot(solver.D, solver.V.T))
    assert_array_almost_equal(
        gram, rebuilt, decimal=4,
        err_msg=("EIGSym does not"
                 " give the correct reconstruction of the matrix"))

    eigvals, eigvecs = np.linalg.eig(gram)
    # linalg.eig does not return the eigenvalues in order, so sort them
    # in descending order and reorder the vectors to match.
    order = np.argsort(eigvals, axis=None).tolist()[::-1]
    eigvals = eigvals[order]
    eigvecs = eigvecs[:, order]
    rebuilt = dot(eigvecs, dot(np.diag(eigvals), eigvecs.T))

    eigvecs, solver.V = direct(eigvecs, solver.V, compare=True)
    assert_array_almost_equal(
        eigvecs, solver.V, decimal=5,
        err_msg=("EIGSym does not"
                 " give the correct eigenvectors"))
def test_scale():
    """Load the Linnerud data and make its last feature column constant.

    No estimator is fitted in this variant; it only performs the setup.
    """
    bunch = load_linnerud()
    X, Y = bunch.data, bunch.target
    # A constant column makes X[:, -1].std() equal to zero.
    X[:, -1] = 1.0
def test_scale():
    """Fit every PLS estimator with ``scale=True`` on a zero-variance column."""
    bunch = load_linnerud()
    X, Y = bunch.data, bunch.target
    # A constant column makes X[:, -1].std() equal to zero.
    X[:, -1] = 1.0

    estimators = [pls.PLSCanonical(), pls.PLSRegression(), pls.CCA(),
                  pls.PLSSVD()]
    for estimator in estimators:
        estimator.set_params(scale=True)
        estimator.fit(X, Y)
def test_load_linnerud():
    """Check shapes, description and backing files of the Linnerud bunch."""
    bunch = load_linnerud()
    expected_shape = (20, 3)
    assert_equal(bunch.data.shape, expected_shape)
    assert_equal(bunch.target.shape, expected_shape)
    assert_equal(len(bunch.target_names), 3)
    assert_true(bunch.DESCR)

    # Both files the bunch points at must exist on disk.
    data_path_exists = os.path.exists(bunch.data_filename)
    assert_true(data_path_exists)
    target_path_exists = os.path.exists(bunch.target_filename)
    assert_true(target_path_exists)

    # test return_X_y option
    loader = partial(load_linnerud)
    check_return_X_y(bunch, loader)
def test_PLSSVD():
    """Fitted estimators expose exactly ``n_components`` y-score columns.

    Guards against returning all possible components instead of the
    requested number.
    """
    linnerud = load_linnerud()
    X, Y = linnerud.data, linnerud.target
    requested = 2
    for estimator_cls in (pls_.PLSSVD, pls_.PLSRegression,
                          pls_.PLSCanonical):
        fitted = estimator_cls(n_components=requested)
        fitted.fit(X, Y)
        assert_equal(requested, fitted.y_scores_.shape[1])
def test_univariate_pls_regression():
    """A 1d ``Y`` and the equivalent single-column ``Y`` give equal ``coef_``."""
    bunch = load_linnerud()
    X, Y = bunch.data, bunch.target
    regressor = pls_.PLSRegression()

    # Same target, once as a flat vector and once as an (n, 1) column.
    coef_from_vector = regressor.fit(X, Y[:, 0]).coef_
    coef_from_column = regressor.fit(X, Y[:, :1]).coef_
    assert_array_almost_equal(coef_from_vector, coef_from_column)
def test_scale_and_stability():
    """Check that ``scale=True`` matches manual standardisation.

    Two datasets are used: Linnerud with a constant last feature, and a
    small hand-written one from bug #2821. Because the expected scores
    are compared numerically, this also exercises numerical stability
    across platforms.
    """
    linnerud = load_linnerud()
    X1, Y1 = linnerud.data, linnerud.target
    # A constant column makes X1[:, -1].std() equal to zero.
    X1[:, -1] = 1.0

    # From bug #2821
    # Test with X2, T2 s.t. clf.x_score[:, 1] == 0, clf.y_score[:, 1] == 0
    # This tests robustness of the algorithm when dealing with values
    # close to 0.
    X2 = np.array([[0., 0., 1.],
                   [1., 0., 0.],
                   [2., 2., 2.],
                   [3., 5., 4.]])
    Y2 = np.array([[0.1, -0.2],
                   [0.9, 1.1],
                   [6.2, 5.9],
                   [11.9, 12.3]])

    def standardise(A):
        # Centre and divide by the sample std; zero stds are replaced by 1.
        spread = A.std(axis=0, ddof=1)
        spread[spread == 0] = 1
        return (A - A.mean(axis=0)) / spread

    for X, Y in ((X1, Y1), (X2, Y2)):
        X_s = standardise(X)
        Y_s = standardise(Y)

        estimators = [CCA(), pls_.PLSCanonical(), pls_.PLSRegression(),
                      pls_.PLSSVD()]
        for estimator in estimators:
            # Raw data with internal scaling ...
            estimator.set_params(scale=True)
            raw_x, raw_y = estimator.fit_transform(X, Y)
            # ... must match pre-standardised data without scaling.
            estimator.set_params(scale=False)
            ref_x, ref_y = estimator.fit_transform(X_s, Y_s)
            assert_array_almost_equal(ref_x, raw_x)
            assert_array_almost_equal(ref_y, raw_y)
            # Scaling should be idempotent
            estimator.set_params(scale=True)
            again_x, again_y = estimator.fit_transform(X_s, Y_s)
            assert_array_almost_equal(ref_x, again_x)
            assert_array_almost_equal(ref_y, again_y)
def load_linnerud():
    """Load scikit-learn's Linnerud dataset and return it shuffled.

    Prints the bunch keys, then hands data and target to
    ``SklearnDataGenerator.shuffle`` and returns whatever that returns.

    Notes carried over from the original comments:
    features are Chins (number of pull-ups), Situps (number of sit-ups)
    and Jumps (jumps); target names are ['Weight', 'Waist', 'Pulse'].
    """
    from sklearn.datasets import load_linnerud

    bunch = load_linnerud()
    print(bunch.keys())
    features, targets = bunch.data, bunch.target
    return SklearnDataGenerator.shuffle(features, targets)
def test_predict_transform_copy():
    """Check that the ``copy`` keyword of predict/transform works."""
    bunch = load_linnerud()
    X, Y = bunch.data, bunch.target
    model = pls_.PLSCanonical()
    X_before = X.copy()
    Y_before = Y.copy()
    model.fit(X, Y)

    # Results must be identical whether or not a copy is made.
    predicted_default = model.predict(X)
    predicted_nocopy = model.predict(X.copy(), copy=False)
    assert_array_almost_equal(predicted_default, predicted_nocopy)

    scores_default = model.transform(X)
    scores_nocopy = model.transform(X.copy(), copy=False)
    assert_array_almost_equal(scores_default, scores_nocopy)

    # Same check when Y is passed as well.
    both_default = model.transform(X, Y)
    both_nocopy = model.transform(X.copy(), Y.copy(), copy=False)
    assert_array_almost_equal(both_default, both_nocopy)

    # The default (copying) calls must not touch the inputs; exact
    # equality is intended here.
    assert_array_equal(X_before, X)
    assert_array_equal(Y_before, Y)

    # Also make sure the mean was not zero to begin with, otherwise the
    # check above could not detect in-place centring.
    assert_true(np.all(X.mean(axis=0) != 0))
from sklearn.datasets import load_linnerud
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# NOTE(review): this wildcard import runs after the scikit-learn import
# above; if the module exports its own MultiOutputRegressor it replaces
# the scikit-learn class used below -- confirm which one is intended.
from MyMultiOutputRegressor.MultiOutputRegressor import *

# Load the data and hold out 20% of it for validation.
inputs, outputs = load_linnerud(return_X_y=True)
trainInputs, validationInputs, trainOutputs, validationOutputs = \
    train_test_split(inputs, outputs, test_size=0.20, random_state=1)

# The same scaler object is fitted on the training inputs first, then
# re-fitted on the training outputs; validation data is only transformed.
scaler = StandardScaler()
scaler.fit(trainInputs)
trainInputs, validationInputs = (
    scaler.transform(part) for part in (trainInputs, validationInputs))
scaler.fit(trainOutputs)
trainOutputs, validationOutputs = (
    scaler.transform(part) for part in (trainOutputs, validationOutputs))

print("------------------------sklearn multioutput regressor----------------")
model = MultiOutputRegressor(Ridge(random_state=1))
model = model.fit(trainInputs, trainOutputs)
predictedOutputs = model.predict(validationInputs)
error = mean_squared_error(validationOutputs, predictedOutputs)

# Report intercept and coefficients of the first three fitted estimators.
for position in range(3):
    fitted = model.estimators_[position]
    print(fitted.intercept_, fitted.coef_)
print('prediction error', error)
# Import necessary library
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error as mse
from sklearn.linear_model import LinearRegression

# TODO: split the data into training and testing sets,
# using the standard settings for train_test_split.
# Then, train and test the classifiers with your newly split data instead of X and y.

# Solution
# Prepare the data as features and labels.
features = X
labels = y

# Split the data into training and testing sets.
# ``sklearn.cross_validation`` no longer exists in current scikit-learn;
# ``model_selection`` provides ``train_test_split`` instead. The old
# module is kept as a fallback for installations that predate
# ``model_selection``, and the name ``cross_validation`` stays bound
# either way so later code using it keeps working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation

features_train, features_test, labels_train, labels_test = \
    cross_validation.train_test_split(
        features, labels, test_size=0.4, random_state=0)

# Create decision tree regressor/algorithm object
This example introduces the Regressor object in a multi-target regression task. """ # Author: Alex Wozniakowski <*****@*****.**> import pandas as pd from sklearn.datasets import load_linnerud from sklearn.model_selection import train_test_split from physlearn import Regressor # Load the data from Sklearn bunch = load_linnerud(as_frame=True) # returns a Bunch instance X, y = bunch['data'], bunch['target'] # Split the data, using the default test_size=0.25. # X_train has shape (15, 3), y_train has shape (15, 3) # X_test has shape (5, 3), and y_test has shape (5, 3). # Namely, there are 3 features and 3 single-target regression subtasks. X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) # Choose the underlying regressor to be the Sklearn # histogram-based gradient boosting regressor. regressor_choice = 'HistGradientBoostingRegressor' # Choose the Sklearn QuantileTransformer as the data preprocessor. # The output distribution is the Gaussian, e.g., 'normal'. # The number of quantiles is the number of examples in y_train,
def load_UCI_dataset(dsIn):
    '''Loads a UCI dataset

    Only the requested dataset is loaded: the name is mapped to the
    name of a ``sklearn.datasets`` loader, which is then looked up and
    called. A loader missing from the installed scikit-learn (for
    example ``load_boston`` on recent releases) therefore only matters
    when that dataset is actually requested.

    :param dsIn: the dataset name: ``"iris"``, ``"boston"``,
        ``"diabetes"`` or ``"linnerud"``. The historical key
        ``" linnerud"`` (with a leading space) is still accepted.
    :return: A SciKit dataset
    :raises KeyError: if ``dsIn`` is not one of the known names
    '''
    from sklearn import datasets

    loader_names = {
        "iris": "load_iris",
        "boston": "load_boston",
        "diabetes": "load_diabetes",
        "linnerud": "load_linnerud",
        # The original table spelled the key with a leading space; keep
        # it so existing callers that relied on it do not break.
        " linnerud": "load_linnerud",
    }
    loader = getattr(datasets, loader_names[dsIn])
    return loader()
def test_pls():
    """Reference test for ``PLSCanonical`` and ``PLSRegression``.

    Sections: (1) canonical PLS on Linnerud, comparing the default and
    "svd" algorithms, checking orthogonality, reconstruction and
    transforms, and comparing against stored reference values;
    (2) PLS regression against stored reference values; (3) canonical
    PLS on a seeded random dataset; (4) PLS regression when the first
    column of Y is constant. Reference values are compared up to a
    per-column sign flip.
    """
    d = load_linnerud()
    X = d.data
    Y = d.target
    # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
    # ===========================================================
    # Compare 2 algo.: nipals vs. svd
    # ------------------------------
    pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1])
    pls_bynipals.fit(X, Y)
    pls_bysvd = pls_.PLSCanonical(algorithm="svd", n_components=X.shape[1])
    pls_bysvd.fit(X, Y)
    # check equalities of loading (up to the sign of the second column)
    assert_array_almost_equal(
        pls_bynipals.x_loadings_,
        pls_bysvd.x_loadings_, decimal=5,
        err_msg="nipals and svd implementations lead to different x loadings")

    assert_array_almost_equal(
        pls_bynipals.y_loadings_,
        pls_bysvd.y_loadings_, decimal=5,
        err_msg="nipals and svd implementations lead to different y loadings")

    # Check PLS properties (with n_components=X.shape[1])
    # ---------------------------------------------------
    plsca = pls_.PLSCanonical(n_components=X.shape[1])
    plsca.fit(X, Y)
    T = plsca.x_scores_
    P = plsca.x_loadings_
    Wx = plsca.x_weights_
    U = plsca.y_scores_
    Q = plsca.y_loadings_
    Wy = plsca.y_weights_

    def check_ortho(M, err_msg):
        # M.T M must equal its own diagonal, i.e. columns are orthogonal.
        K = np.dot(M.T, M)
        assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(Wx, "x weights are not orthogonal")
    check_ortho(Wy, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(T, "x scores are not orthogonal")
    check_ortho(U, "y scores are not orthogonal")

    # Check X = TP' and Y = UQ' (with (p == q) components)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # center scale X, Y
    Xc, Yc, x_mean, y_mean, x_std, y_std =\
        pls_._center_scale_xy(X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'")
    assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'")

    # Check that rotations on training data lead to scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xr = plsca.transform(X)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    Xr, Yr = plsca.transform(X, Y)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    assert_array_almost_equal(Yr, plsca.y_scores_,
                              err_msg="rotation on Y failed")

    # Check that inverse_transform works
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xreconstructed = plsca.inverse_transform(Xr)
    assert_array_almost_equal(Xreconstructed, X,
                              err_msg="inverse_transform failed")

    # "Non regression test" on canonical PLS
    # --------------------------------------
    # The results were checked against the R-package plspm
    pls_ca = pls_.PLSCanonical(n_components=X.shape[1])
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, 0.25616119, -0.74715187],
         [-0.74697144, 0.11930791, 0.65406368],
         [-0.25668686, -0.95924297, -0.11817271]])
    # x_weights_sign_flip holds columns of 1 or -1, depending on sign flip
    # between R and python
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_rotations = np.array(
        [[-0.61330704, 0.41591889, -0.62297525],
         [-0.74697144, 0.31388326, 0.77368233],
         [-0.25668686, -0.89237972, -0.24121788]])
    x_rotations_sign_flip = pls_ca.x_rotations_ / x_rotations

    y_weights = np.array(
        [[+0.58989127, 0.7890047, 0.1717553],
         [+0.77134053, -0.61351791, 0.16920272],
         [-0.23887670, -0.03267062, 0.97050016]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_rotations = np.array(
        [[+0.58989127, 0.7168115, 0.30665872],
         [+0.77134053, -0.70791757, 0.19786539],
         [-0.23887670, -0.00343595, 0.94162826]])
    y_rotations_sign_flip = pls_ca.y_rotations_ / y_rotations

    # x_weights = X.dot(x_rotation)
    # Hence R/python sign flip should be the same in x_weight and x_rotation
    assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip)
    # This test that R / python give the same result up to column
    # sign indeterminacy
    assert_array_almost_equal(np.abs(x_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip)
    assert_array_almost_equal(np.abs(y_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 2) Regression PLS (PLS2): "Non regression test"
    # ===============================================
    # The results were checked against the R-packages plspm, misOmics and pls
    pls_2 = pls_.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, -0.00443647, 0.78983213],
         [-0.74697144, -0.32172099, -0.58183269],
         [-0.25668686, 0.94682413, -0.19399983]])
    x_weights_sign_flip = pls_2.x_weights_ / x_weights

    x_loadings = np.array(
        [[-0.61470416, -0.24574278, 0.78983213],
         [-0.65625755, -0.14396183, -0.58183269],
         [-0.51733059, 1.00609417, -0.19399983]])
    x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings

    y_weights = np.array(
        [[+0.32456184, 0.29892183, 0.20316322],
         [+0.42439636, 0.61970543, 0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    y_weights_sign_flip = pls_2.y_weights_ / y_weights

    y_loadings = np.array(
        [[+0.32456184, 0.29892183, 0.20316322],
         [+0.42439636, 0.61970543, 0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    y_loadings_sign_flip = pls_2.y_loadings_ / y_loadings

    # x_loadings[:, i] = Xi.dot(x_weights[:, i]) \forall i
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 3) Another non-regression test of Canonical PLS on random dataset
    # =================================================================
    # The results were checked against the R-package plspm
    n = 500
    p_noise = 10
    q_noise = 5
    # 2 latents vars:
    rng = check_random_state(11)
    l1 = rng.normal(size=n)
    l2 = rng.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    X = latents + rng.normal(size=4 * n).reshape((n, 4))
    Y = latents + rng.normal(size=4 * n).reshape((n, 4))
    # Append pure-noise columns to both blocks.
    X = np.concatenate(
        (X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1)
    Y = np.concatenate(
        (Y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1)

    pls_ca = pls_.PLSCanonical(n_components=3)
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[0.65803719, 0.19197924, 0.21769083],
         [0.7009113, 0.13303969, -0.15376699],
         [0.13528197, -0.68636408, 0.13856546],
         [0.16854574, -0.66788088, -0.12485304],
         [-0.03232333, -0.04189855, 0.40690153],
         [0.1148816, -0.09643158, 0.1613305],
         [0.04792138, -0.02384992, 0.17175319],
         [-0.06781, -0.01666137, -0.18556747],
         [-0.00266945, -0.00160224, 0.11893098],
         [-0.00849528, -0.07706095, 0.1570547],
         [-0.00949471, -0.02964127, 0.34657036],
         [-0.03572177, 0.0945091, 0.3414855],
         [0.05584937, -0.02028961, -0.57682568],
         [0.05744254, -0.01482333, -0.17431274]])
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_loadings = np.array(
        [[0.65649254, 0.1847647, 0.15270699],
         [0.67554234, 0.15237508, -0.09182247],
         [0.19219925, -0.67750975, 0.08673128],
         [0.2133631, -0.67034809, -0.08835483],
         [-0.03178912, -0.06668336, 0.43395268],
         [0.15684588, -0.13350241, 0.20578984],
         [0.03337736, -0.03807306, 0.09871553],
         [-0.06199844, 0.01559854, -0.1881785],
         [0.00406146, -0.00587025, 0.16413253],
         [-0.00374239, -0.05848466, 0.19140336],
         [0.00139214, -0.01033161, 0.32239136],
         [-0.05292828, 0.0953533, 0.31916881],
         [0.04031924, -0.01961045, -0.65174036],
         [0.06172484, -0.06597366, -0.1244497]])
    x_loadings_sign_flip = pls_ca.x_loadings_ / x_loadings

    y_weights = np.array(
        [[0.66101097, 0.18672553, 0.22826092],
         [0.69347861, 0.18463471, -0.23995597],
         [0.14462724, -0.66504085, 0.17082434],
         [0.22247955, -0.6932605, -0.09832993],
         [0.07035859, 0.00714283, 0.67810124],
         [0.07765351, -0.0105204, -0.44108074],
         [-0.00917056, 0.04322147, 0.10062478],
         [-0.01909512, 0.06182718, 0.28830475],
         [0.01756709, 0.04797666, 0.32225745]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_loadings = np.array(
        [[0.68568625, 0.1674376, 0.0969508],
         [0.68782064, 0.20375837, -0.1164448],
         [0.11712173, -0.68046903, 0.12001505],
         [0.17860457, -0.6798319, -0.05089681],
         [0.06265739, -0.0277703, 0.74729584],
         [0.0914178, 0.00403751, -0.5135078],
         [-0.02196918, -0.01377169, 0.09564505],
         [-0.03288952, 0.09039729, 0.31858973],
         [0.04287624, 0.05254676, 0.27836841]])
    y_loadings_sign_flip = pls_ca.y_loadings_ / y_loadings

    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_weights_, "x weights are not orthogonal")
    check_ortho(pls_ca.y_weights_, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_scores_, "x scores are not orthogonal")
    check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")

    # 4) Another "Non regression test" of PLS Regression (PLS2):
    #    Checking behavior when the first column of Y is constant
    # ===============================================
    # The results were compared against a modified version of plsreg2
    # from the R-package plsdepot
    X = d.data
    Y = d.target
    # Y is d.target itself (no copy), so this overwrites the loaded
    # bunch's first target column in place.
    Y[:, 0] = 1
    pls_2 = pls_.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array(
        [[-0.6273573, 0.007081799, 0.7786994],
         [-0.7493417, -0.277612681, -0.6011807],
         [-0.2119194, 0.960666981, -0.1794690]])
    x_weights_sign_flip = pls_2.x_weights_ / x_weights

    x_loadings = np.array(
        [[-0.6273512, -0.22464538, 0.7786994],
         [-0.6643156, -0.09871193, -0.6011807],
         [-0.5125877, 1.01407380, -0.1794690]])
    x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings

    y_loadings = np.array(
        [[0.0000000, 0.0000000, 0.0000000],
         [0.4357300, 0.5828479, 0.2174802],
         [-0.1353739, -0.2486423, -0.1810386]])

    # R/python sign flip should be the same in x_weight and x_rotation
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)

    # This test that R / python give the same result up to column
    # sign indeterminacy
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    # For the PLSRegression with default parameters, it holds that
    # y_loadings==y_weights. In this case we only test that R/python
    # give the same result for the y_loadings irrespective of the sign
    assert_array_almost_equal(np.abs(pls_2.y_loadings_), np.abs(y_loadings), 4)
######################### import stuff ########################## import numpy as np import pandas as pd import tensorflow as tf from sklearn.datasets import load_linnerud from sklearn.model_selection import train_test_split ######################## prepare the data ######################## X, y = load_linnerud(return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=5) ######################## set learning variables ################## learning_rate = 0.0005 epochs = 2000 batch_size = 3 ######################## set some variables ####################### x = tf.placeholder(tf.float32, [None, 3], name='x') # 3 features y = tf.placeholder(tf.float32, [None, 3], name='y') # 3 outputs # hidden layer 1 W1 = tf.Variable(tf.truncated_normal([3, 10], stddev=0.03), name='W1') b1 = tf.Variable(tf.truncated_normal([10]), name='b1') # hidden layer 2 W2 = tf.Variable(tf.truncated_normal([10, 3], stddev=0.03), name='W2') b2 = tf.Variable(tf.truncated_normal([3]), name='b2')
def create_linnerud():
    """Load the Linnerud dataset, then fail unconditionally."""
    linnerud_data = datasets.load_linnerud()
    # NOTE(review): always raises AssertionError (when assertions are
    # enabled) and never uses the loaded data -- looks like an
    # unfinished placeholder; confirm the intent.
    assert False
assert sm.var_test_scaler.mean_.all() == np.array([[7.8, 154.8, 104.4]]).all() assert sm.var_test_scaler.var_.all() == np.array( [[34.16, 5246.16, 6053.44]]).all() assert sm.obj_train_scaler.mean_.all() == np.array( [[173.06666667, 34.66666667, 56.53333333]]).all() assert sm.obj_train_scaler.var_.all() == np.array( [[341.79555556, 4.35555556, 58.38222222]]).all() assert sm.obj_test_scaler.mean_.all() == np.array([[195.2, 37.6, 54.8]]).all() assert sm.obj_test_scaler.var_.all() == np.array([[923.76, 19.44, 20.16]]).all() model = tm.Surrogate_Models() variables, objectives = datasets.load_linnerud(return_X_y=True) model.random = 57757 model.update_database(np.ndarray.tolist(variables), np.ndarray.tolist(objectives)) model._initialize_models() def test_initialize_models(): models = model.models assert 'lr' in models assert 'pr' in models assert 'mars' in models assert 'gpr' in models assert 'ann' in models assert 'rf' in models
# [ 10.73412075 166.87684314 85.9018666 174.46486234 34.77098762 # 56.57285616] # [ 18.69314353 299.06132114 182.60275097 148.83514259 30.87234812 # 59.50363423] # [ 7.65798496 115.78798285 48.52729701 184.37066362 36.2777988 # 55.4401202 ] # [ 13.25814681 208.79619397 116.56838932 166.33696994 33.5346213 # 57.50228687] # [ 13.26989716 208.9913453 116.71115423 166.29913135 33.52886552 # 57.50661375] # [ 4.69419072 66.56490544 12.51765966 193.91470161 37.7295807 # 54.34875215] # [ 5.34592844 77.38904966 20.43617123 191.8159697 37.41033417 # 54.58874375] # [ 13.13301791 206.71803705 115.04809272 166.7399112 33.59591431 # 57.45621023] # [ 4.9236322 70.37549919 15.30533784 193.175852 37.61719133 # 54.43324017] # [ 13.10738592 206.29233749 114.73666792 166.8224516 33.60846986 # 57.44677168] # [ 12.73830285 200.16255828 110.25236629 168.01097635 33.78926113 # 57.31086296] # [ 7.62722272 115.27707959 48.15354059 184.46972448 36.29286734 # 55.42879251]] # 5) # 0.78 ll = load_linnerud() print(ll.DESCR)
def scikitAlgorithms_UCIDataset(input_dict):
    """Load the scikit-learn toy dataset named by ``input_dict['dsIn']``.

    Only the requested dataset is loaded: the name is mapped to the
    name of a ``sklearn.datasets`` loader, which is then looked up and
    called. A loader missing from the installed scikit-learn (for
    example ``load_boston`` on recent releases) therefore only matters
    when that dataset is actually requested.

    :param input_dict: mapping with key ``'dsIn'`` holding the dataset
        name: ``"iris"``, ``"boston"``, ``"diabetes"`` or
        ``"linnerud"``. The historical key ``" linnerud"`` (with a
        leading space) is still accepted.
    :return: dict with the single key ``'dtsOut'`` holding the loaded
        dataset object
    :raises KeyError: if ``'dsIn'`` is absent or names an unknown dataset
    """
    from sklearn import datasets

    loader_names = {
        "iris": "load_iris",
        "boston": "load_boston",
        "diabetes": "load_diabetes",
        "linnerud": "load_linnerud",
        # The original table spelled the key with a leading space; keep
        # it so existing callers that relied on it do not break.
        " linnerud": "load_linnerud",
    }
    loader = getattr(datasets, loader_names[input_dict['dsIn']])
    dataset = loader()
    return {'dtsOut': dataset}
def run_4(feature_to_plot):
    """Fit several linear models on Linnerud and plot their coefficients.

    Fits Ridge and Lasso once per target column, plus MultiTaskLasso and
    ``low_ranked_regression`` on all targets at once, then shows a
    sparsity plot (Lasso vs. MultiTaskLasso) and a line plot of one
    coefficient column per model.

    :param feature_to_plot: column index selected from every coefficient
        matrix for the line plot
    """
    # NOTE(review): the original comment here spoke of generating 2D
    # sine-wave coefficients, but the code loads the Linnerud dataset.
    from sklearn.datasets import load_linnerud
    linnerud = load_linnerud()
    """
    print(linnerud.feature_names)
    print(linnerud.data)
    print(linnerud.target_names)
    print(linnerud.target)
    """
    X, Y = linnerud.data, linnerud.target
    # [print(y) for y in Y.T]
    # One single-target model per column of Y; rows of the result are targets.
    coef_ridge_ = np.array([Ridge(alpha=0.5).fit(X, y).coef_ for y in Y.T])
    coef_lasso_ = np.array([Lasso(alpha=0.5).fit(X, y).coef_ for y in Y.T])
    coef_multi_task_lasso_ = MultiTaskLasso(alpha=1.).fit(X, Y).coef_
    coef_low_ranked_ = low_ranked_regression(X, Y, 3)

    # #############################################################################
    # Plot support and time series
    fig = plt.figure(figsize=(8, 5))
    plt.subplot(1, 2, 1)
    plt.spy(coef_lasso_)
    plt.xlabel('Feature')
    plt.ylabel('Time (or Task)')
    plt.text(10, 5, 'Lasso')
    plt.subplot(1, 2, 2)
    plt.spy(coef_multi_task_lasso_)
    plt.xlabel('Feature')
    plt.ylabel('Time (or Task)')
    plt.text(10, 5, 'MultiTaskLasso')
    fig.suptitle('Coefficient non-zero location')
    plt.tight_layout()

    plt.figure()
    lw = 1
    """
    plt.plot(Y[:, feature_to_plot], color='seagreen', linewidth=lw, label='Ground truth')
    """
    # One line per model: the selected coefficient column across targets.
    plt.plot(coef_lasso_[:, feature_to_plot], color='cornflowerblue', linewidth=lw,
             label='Lasso')
    plt.plot(coef_ridge_[:, feature_to_plot], color='red', linewidth=lw,
             label='Ridge')
    plt.plot(coef_low_ranked_[:, feature_to_plot], color='magenta', linewidth=lw,
             label='LowRanked')
    plt.plot(coef_multi_task_lasso_[:, feature_to_plot], color='gold', linewidth=lw,
             label='MultiTaskLasso')
    plt.legend(loc='upper center')
    plt.axis('tight')
    plt.ylim([-1.1, 1.1])
    plt.tight_layout()
    plt.show()
print()
# Section: breast cancer dataset (binary classification).
print('--------------------------乳腺癌数据集------------------------')
from sklearn.datasets import load_breast_cancer

breast_cancer = load_breast_cancer()
print('简单经典的用于二分类任务的数据集')
print('数据属性', breast_cancer.keys())

print()
# Section: the other bundled datasets.
print('其他数据集')
from sklearn.datasets import load_boston
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_wine

# Boston housing (regression): show the bunch keys.
print('波士顿房价数据集load-boston是经典的用于回归任务的数据集')
print('波士顿房价数据集的数据属性为', load_boston().keys())
# Diabetes (regression) and digits (multi-class) are only described.
print('糖尿病数据集load-diabetes是经典的用于回归任务的数据集')
print('手写数字数据集load_digits适用于多分类任务的数据集')

# Linnerud (multivariate regression). Load it once and reuse the bunch
# for all four prints instead of re-reading the dataset each time.
print('体能训练数据集load-linnerud是经典的用于多变量回归任务的数据集,其内部包含两个小数据集'
      ':Excise是对三个训练变量(引体向上、仰卧起坐、立定跳远)的20次观测;physiological是对'
      '三个生理学变量(体重、腰围、脉搏)的20次观测')
linnerud = load_linnerud()
print(linnerud.keys())
print(linnerud.target)
print(linnerud.target_names)
print(linnerud.feature_names)

# Wine (three-class classification); likewise loaded once.
print('葡萄酒数据集load-wine包括了3中酒中13中不同成分的数量,共178个样本,对应三种葡萄酒')
wine = load_wine()
print(wine.target_names)
print(wine.keys())
print(wine.feature_names)
def experimentVariables(projectName):
    '''
    Return all the variables necessary to start an experiment,
    including:
    name of the experiment
    datasets locations
    variables to report from the experiment
    what to report from the experiment

    The settings dict is only returned when ``projectName`` is
    ``'unlabeledModelS'`` and the ``COMPUTERNAME`` environment variable
    is ``'JULIAN'``; in every other case the function falls through and
    returns ``None``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-mangled source; the ``else`` is assumed to belong to the
    computer-name check -- confirm against the original file.
    '''
    computerName=os.environ.get('COMPUTERNAME')
    if projectName== 'unlabeledModelS':
        if computerName=='JULIAN':
            from sklearn.datasets import load_boston, load_iris, load_diabetes, load_digits, load_linnerud
            # Every dataset is loaded up front, although only one is used.
            datasets={'boston':load_boston(),'iris':load_iris(),'diabetes':load_diabetes(),'digits':load_digits(),'linnerud':load_linnerud()}
            print('working at JULIAN@CMU')
            dataset='digits'
            numTests=50
            experimentName='One'
            agmntlvl=0
            description='here the description of this experiment'
            data=datasets[dataset]['data']
            labels=datasets[dataset]['target']
            verbose=0
            plots=False
            signal2plot='f1_score_mv_predval_agmnt'
            # signal2plot='f1_score_val_predval_agmnt'
            #The next are variables that store the outcomes from the
            #experiment
            # ``variables`` is a string of Python source (here a single
            # assignment); how it is consumed is not visible in this block.
            variables=\
            'spear=[]\n'
            return {'dataset':dataset,'numTests':numTests,'experimentName':experimentName,\
                    'description':description,'agmntlvl':agmntlvl,'variables':variables,'data':data,\
                    'labels':labels,'verbose':verbose,'plots':plots,'signal2plot':signal2plot}
        else:
            print('Variables not defined for Julian@Laptop')
def loadRegSample(self):
    '''
    Load the Linnerud sample dataset for regression into ``self.data``
    and log its ``DESCR`` text at info level.
    '''
    self.data = load_linnerud()
    description = self.data.DESCR
    logger.info(description)
def test_predictions():
    """Compare PLSR / PLSC / O2PLS predictions with the reference estimators.

    All models are fitted on the Linnerud data with two components,
    centring but no scaling. PLSR must match ``PLSRegression``; PLSC
    must match ``PLSCanonical`` in residual sum of squares; and O2PLS
    must have a smaller residual than ``PLSCanonical``.
    """
    d = load_linnerud()
    X = d.data
    Y = d.target
    tol = 5e-12
    miter = 1000
    num_comp = 2
    # Untouched copies, used for the reference fits and for all residuals.
    Xorig = X.copy()
    Yorig = Y.copy()
    # SSY = np.sum(Yorig**2)
    # center = True
    scale = False

    pls1 = PLSRegression(n_components = num_comp, scale = scale,
                         tol = tol, max_iter = miter, copy = True)
    pls1.fit(Xorig, Yorig)
    Yhat1 = pls1.predict(Xorig)

    SSYdiff1 = np.sum((Yorig-Yhat1)**2)
    # print "PLSRegression: R2Yhat = %.4f" % (1 - (SSYdiff1 / SSY))

    # Compare PLSR and sklearn.PLSRegression
    pls3 = PLSR(num_comp = num_comp, center = True, scale = scale,
                tolerance = tol, max_iter = miter)
    pls3.fit(X, Y)
    Yhat3 = pls3.predict(X)

    assert_array_almost_equal(Yhat1, Yhat3, decimal = 5,
                              err_msg = "PLSR gives wrong prediction")

    SSYdiff3 = np.sum((Yorig-Yhat3)**2)
    # print "PLSR : R2Yhat = %.4f" % (1 - (SSYdiff3 / SSY))

    assert abs(SSYdiff1 - SSYdiff3) < 0.00005

    pls2 = PLSCanonical(n_components = num_comp, scale = scale,
                        tol = tol, max_iter = miter, copy = True)
    pls2.fit(Xorig, Yorig)
    Yhat2 = pls2.predict(Xorig)

    SSYdiff2 = np.sum((Yorig-Yhat2)**2)
    # print "PLSCanonical : R2Yhat = %.4f" % (1 - (SSYdiff2 / SSY))

    # Compare PLSC and sklearn.PLSCanonical
    pls4 = PLSC(num_comp = num_comp, center = True, scale = scale,
                tolerance = tol, max_iter = miter)
    pls4.fit(X, Y)
    Yhat4 = pls4.predict(X)

    SSYdiff4 = np.sum((Yorig-Yhat4)**2)
    # print "PLSC : R2Yhat = %.4f" % (1 - (SSYdiff4 / SSY))

    # Compare O2PLS and sklearn.PLSCanonical
    pls5 = O2PLS(num_comp = [num_comp, 1, 0], center = True, scale = scale,
                 tolerance = tol, max_iter = miter)
    pls5.fit(X, Y)
    Yhat5 = pls5.predict(X)

    SSYdiff5 = np.sum((Yorig-Yhat5)**2)
    # print "O2PLS : R2Yhat = %.4f" % (1 - (SSYdiff5 / SSY))

    assert abs(SSYdiff2 - SSYdiff4) < 0.00005
    # O2PLS is expected to leave a strictly smaller residual.
    assert SSYdiff2 > SSYdiff5
def test_update_database():
    """Check the train/test split produced by ``update_database``.

    With ``random = 57757`` the 20 Linnerud rows must split into the 15
    training rows listed below plus 5 test rows; adding two more rows
    must then give 16 training and 6 test rows.
    """
    sm = tm.Surrogate_Models()
    variables, objectives = datasets.load_linnerud(return_X_y=True)
    sm.random = 57757
    sm.update_database(np.ndarray.tolist(variables), np.ndarray.tolist(objectives))

    # Expected training rows, in order, for the value of ``random`` above.
    ind_var_given = [[11, 230, 80, ],
                     [6, 70, 31, ],
                     [2, 110, 43, ],
                     [14, 215, 105, ],
                     [15, 225, 73, ],
                     [4, 60, 25, ],
                     [12, 105, 37, ],
                     [12, 101, 101, ],
                     [13, 210, 115, ],
                     [13, 155, 58, ],
                     [2, 110, 60, ],
                     [15, 200, 40, ],
                     [6, 125, 40, ],
                     [8, 101, 38, ],
                     [17, 120, 38, ]]
    obj_var_given = [[157, 32, 52, ],
                     [193, 36, 46, ],
                     [138, 33, 68, ],
                     [154, 34, 64, ],
                     [156, 33, 54, ],
                     [176, 37, 54, ],
                     [162, 35, 62, ],
                     [193, 38, 58, ],
                     [166, 33, 52, ],
                     [189, 35, 46, ],
                     [189, 37, 52, ],
                     [176, 31, 74, ],
                     [167, 34, 60, ],
                     [211, 38, 56, ],
                     [169, 34, 50, ]]

    np.testing.assert_array_equal(sm.var_train, ind_var_given)
    np.testing.assert_array_equal(sm.obj_train, obj_var_given)
    assert len(sm.var_test) == 5
    assert len(sm.obj_test) == 5

    # Two further (identical) rows: one ends up in each partition.
    sm.update_database([[12, 250, 85, ],
                        [12, 250, 85, ], ],
                       [[165, 33, 57, ],
                        [165, 33, 57, ]])
    assert len(sm.var_train) == 16
    assert len(sm.obj_train) == 16
    assert len(sm.var_test) == 6
    assert len(sm.obj_test) == 6
import numpy as np
import pandas as pd

# Load the dataset
from sklearn.datasets import load_linnerud

linnerud_data = load_linnerud()
X = linnerud_data.data
y = linnerud_data.target

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.linear_model import LinearRegression
# ``sklearn.cross_validation`` no longer exists in current scikit-learn;
# ``model_selection`` provides ``train_test_split`` instead. The old
# module is kept as a fallback for installations that predate
# ``model_selection``, and the name ``cross_validation`` stays bound
# either way so later code using it keeps working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
from sklearn.metrics import mean_squared_error as mse

# Split with the default settings of train_test_split.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(X, y)

# Decision tree: fit, then score on the held-out rows.
reg1 = DecisionTreeRegressor()
reg1.fit(x_train, y_train)
mean_absolute_error_tree = mae(reg1.predict(x_test), y_test)
mean_squared_error_tree = mse(reg1.predict(x_test), y_test)
# Single-argument print(...) runs under both Python 2 and Python 3.
print("Decision Tree mean absolute error: {:.2f}".format(
    mean_absolute_error_tree))
# The label used to say "absolute" here although the value is the MSE.
print("Decision Tree mean squared error: {:.2f}".format(
    mean_squared_error_tree))

# Linear regression: same procedure.
reg2 = LinearRegression()
reg2.fit(x_train, y_train)
mean_absolute_error_linear = mae(reg2.predict(x_test), y_test)
mean_squared_error_linear = mse(reg2.predict(x_test), y_test)
x_data = np.zeros([20,2]) x_data[:,0] = 1 x_data[:,1] = x y_data = y y_data = np.expand_dims(y_data,axis=1) # compute the weights W = np.dot(np.dot(inv((np.dot(x_data.T,x_data))),x_data.T),y_data) return W # -- Get data data_set = load_linnerud() raw_data = data_set.data # Chins, Situps, Jumps features_names = data_set.feature_names target_data = data_set.target # Weight, Waist, Pulse target_names = data_set.target_names fig, axis = plt.subplots(3, 3) fig.set_size_inches(20,25) for i in range(len(target_names)): x_temp = target_data[:,i] for j in range(len(features_names)):
def test_linnerud_data():
    """``apply_toy_on`` must return more than -3000 on the Linnerud data."""
    features, targets = load_linnerud(return_X_y=True)
    outcome = apply_toy_on(features, targets)
    assert outcome > -3000
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from mpl_toolkits.mplot3d import Axes3D

lin = datasets.load_linnerud()
lin_features = lin.feature_names

# Keep only the feature at index 2, as a single-column 2d array.
X = lin.data[:, np.newaxis, 2]

# Everything except the last 15 rows is used for training.
# NOTE(review): with 20 rows that is 5 training and 15 test samples --
# confirm the split is not meant to be the other way round.
X_train, X_test = X[:-15], X[-15:]
y_train, y_test = lin.target[:-15], lin.target[-15:]

# training
regression = linear_model.LinearRegression()
regression.fit(X_train, y_train)

# prediction
y_pred = regression.predict(X_test)

print('Coeficients: \n', regression.coef_)
test_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: \n %.2f" % test_mse)
test_r2 = r2_score(y_test, y_pred)
print('Variance Score: \n %.2f' % test_r2)
#exec(open('.\\trees\\sklearn\\datasets.py').read())
import os
import subprocess as sp

from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
from sklearn.datasets import load_diabetes
from sklearn.datasets import load_digits
from sklearn.datasets import load_linnerud
from sklearn.datasets import load_wine
from sklearn.datasets import load_breast_cancer

if __name__ == '__main__':
    # Clear the terminal. "cls" only exists on Windows, so fall back to
    # "clear" on every other platform.
    sp.call('cls' if os.name == 'nt' else 'clear', shell=True)

    # Load every bundled dataset once, keyed by a short name. The insertion
    # order matches the original script (iris first).
    ds = dict()
    ds['iris'] = load_iris()
    ds['boston'] = load_boston()
    ds['diabetes'] = load_diabetes()
    ds['digits'] = load_digits()
    ds['linnerud'] = load_linnerud()
    ds['wine'] = load_wine()
    ds['breastcancer'] = load_breast_cancer()

    # Print the keys of every dataset, indented under the dataset name.
    for key in ds:
        print(key)
        for key2 in ds[key]:
            print('{0}{1}'.format('  ', key2))
import numpy as np from numpy.testing import assert_array_almost_equal from sklearn.datasets import load_linnerud from sklearn import pls d = load_linnerud() X = d.data Y = d.target def test_pls(): n_components = 2 # 1) Canonical (symetric) PLS (PLS 2 blocks canonical mode A) # =========================================================== # Compare 2 algo.: nipals vs. svd # ------------------------------ pls_bynipals = pls.PLSCanonical(n_components=n_components) pls_bynipals.fit(X, Y) pls_bysvd = pls.PLSCanonical(algorithm="svd", n_components=n_components) pls_bysvd.fit(X, Y) # check that the loading vectors are highly correlated assert_array_almost_equal( [ np.abs(np.corrcoef(pls_bynipals.x_loadings_[:, k], pls_bysvd.x_loadings_[:, k])[1, 0]) for k in xrange(n_components) ], np.ones(n_components), err_msg="nipals and svd implementation lead to different x loadings", ) assert_array_almost_equal(
def load_multi_data(self):
    """Load the Linnerud dataset and keep it on the instance as lists.

    The ``(X, y)`` pair returned by ``datasets.load_linnerud`` is converted
    with ``tolist()`` and stored in the private ``__inputs`` and
    ``__outputs`` attributes.
    """
    features, targets = datasets.load_linnerud(return_X_y=True)
    self.__inputs = features.tolist()
    self.__outputs = targets.tolist()
#solution_dsci_chapter_02_diabetes.py
"""Data from scikit-learn's training data, and are based on clinical data
available here: http://www4.stat.ncsu.edu/~boos/var.select/diabetes.html

Documentation for this and lots of other machine learning sets can be found
here: http://scikit-learn.org/stable/datasets/index.html
"""
import sklearn.datasets as ds
import pandas as pd

#everything=ds.load_diabetes()
everything = ds.load_linnerud()

# Build DataFrame objects with the contents of the data set.
eser = pd.DataFrame(everything['data'], columns=everything['feature_names'])
pser = pd.DataFrame(everything['target'], columns=everything['target_names'])

# Combine the two frames a column at a time. DataFrame.assign returns a new
# frame and treats every keyword (including "inplace") as a column name, so
# plain item assignment is used to actually add each target column to eser.
for c in pser.columns:
    eser[c] = pser[c]

# Create an "exercise index". assign() does not modify the frame in place,
# so its result has to be bound back to eser.
eser = eser.assign(
    eindex=eser['Chins'] * 3 + eser["Situps"] * 2 + eser["Jumps"])

# Convert the Weight to kilos (2.2 lb = 1 kg).
eser['Weight'] = eser['Weight'] / 2.2
# This tutorial is about getting to know the datasets bundled with scikit-learn.

# Toy datasets
# scikit-learn ships with small preloaded datasets for practicing machine
# learning algorithms.
from sklearn import datasets

iris_data = datasets.load_iris()  # Classification
wine_data = datasets.load_wine()  # Classification
cancer_data = datasets.load_breast_cancer()  # Classification
diabetes_data = datasets.load_diabetes()  # Regression
boston_data = datasets.load_boston()  # Regression
linnerud_data = datasets.load_linnerud()  # Multivariate Regression

# Accessing data
# The loaders are used here through a dictionary-like interface (``.get``).
# "data" holds the feature variable (X) and "target" the target variable (Y).
X, Y, feature_names, target_names = (
    iris_data.get(key)
    for key in ("data", "target", "feature_names", "target_names")
)  # target_names are simply the label names

# The same names are then rebound to the diabetes dataset.
# For regression problems there is no target_names.
X, Y, feature_names = (
    diabetes_data.get(key) for key in ("data", "target", "feature_names")
)

# For more details about the datasets visit
# http://scikit-learn.org/stable/datasets/index.html#datasets
# coding=utf-8 from sklearn import datasets import numpy as np import matplotlib.pyplot as plt d1 = datasets.load_iris() # d2 = datasets.load_breast_cancer() #乳腺癌数据 d3 = datasets.load_digits() #手写数字 d4 = datasets.load_boston() #波士顿房价 d5 = datasets.load_linnerud() #体能数据集 print(d3.keys()) samples,features = d3.data.shape print(samples,features) print(d3.images.shape) #print(d1.data) #print(d1.target) print(d3.target_names) print(np.bincount(d1.target)) x_index = 3 colors = ['blue','red','green'] ''' for label,color in zip(range(len(d1.target_names)),colors): plt.hist(d1.data[d1.target==label,x_index],label=d1.target_names[label],color=color) #直方图
def test_pls():
    """Exercise PLSCanonical and PLSRegression against reference values.

    The test has three parts:

    1. Canonical PLS on the Linnerud data: the NIPALS and SVD fits are
       compared (up to the sign of the second column), then orthogonality of
       weights and scores, the ``X = TP'`` / ``Y = UQ'`` reconstruction and
       ``transform`` on the training data are checked, followed by a
       comparison with hard-coded reference values.
    2. PLS regression (PLS2) on the Linnerud data against hard-coded
       reference values.
    3. Canonical PLS on a synthetic dataset built from two latent variables
       plus noise columns, against hard-coded reference values.

    The synthetic data is drawn from a local ``RandomState(11)`` so that the
    test neither depends on nor modifies NumPy's global random state.
    """
    d = load_linnerud()
    X = d.data
    Y = d.target

    # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
    # ===========================================================
    # Compare 2 algo.: nipals vs. svd
    # ------------------------------
    pls_bynipals = pls.PLSCanonical(n_components=X.shape[1])
    pls_bynipals.fit(X, Y)
    pls_bysvd = pls.PLSCanonical(algorithm="svd", n_components=X.shape[1])
    pls_bysvd.fit(X, Y)
    # check equalities of loading (up to the sign of the second column)
    assert_array_almost_equal(
        pls_bynipals.x_loadings_,
        np.multiply(pls_bysvd.x_loadings_, np.array([1, -1, 1])), decimal=5,
        err_msg="nipals and svd implementation lead to different x loadings")

    assert_array_almost_equal(
        pls_bynipals.y_loadings_,
        np.multiply(pls_bysvd.y_loadings_, np.array([1, -1, 1])), decimal=5,
        err_msg="nipals and svd implementation lead to different y loadings")

    # Check PLS properties (with n_components=X.shape[1])
    # ---------------------------------------------------
    plsca = pls.PLSCanonical(n_components=X.shape[1])
    plsca.fit(X, Y)
    T = plsca.x_scores_
    P = plsca.x_loadings_
    Wx = plsca.x_weights_
    U = plsca.y_scores_
    Q = plsca.y_loadings_
    Wy = plsca.y_weights_

    def check_ortho(M, err_msg):
        # The Gram matrix of M must be diagonal for its columns to be
        # mutually orthogonal.
        K = np.dot(M.T, M)
        assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(Wx, "x weights are not orthogonal")
    check_ortho(Wy, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(T, "x scores are not orthogonal")
    check_ortho(U, "y scores are not orthogonal")

    # Check X = TP' and Y = UQ' (with (p == q) components)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # center scale X, Y
    Xc, Yc, x_mean, y_mean, x_std, y_std =\
        pls._center_scale_xy(X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'")
    assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'")

    # Check that rotations on training data lead to scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xr = plsca.transform(X)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    Xr, Yr = plsca.transform(X, Y)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    assert_array_almost_equal(Yr, plsca.y_scores_,
                              err_msg="rotation on Y failed")

    # "Non regression test" on canonical PLS
    # --------------------------------------
    # The results were checked against the R-package plspm
    pls_ca = pls.PLSCanonical(n_components=X.shape[1])
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, 0.25616119, -0.74715187],
         [-0.74697144, 0.11930791, 0.65406368],
         [-0.25668686, -0.95924297, -0.11817271]])
    assert_array_almost_equal(pls_ca.x_weights_, x_weights)

    x_rotations = np.array(
        [[-0.61330704, 0.41591889, -0.62297525],
         [-0.74697144, 0.31388326, 0.77368233],
         [-0.25668686, -0.89237972, -0.24121788]])
    assert_array_almost_equal(pls_ca.x_rotations_, x_rotations)

    y_weights = np.array(
        [[+0.58989127, 0.7890047, 0.1717553],
         [+0.77134053, -0.61351791, 0.16920272],
         [-0.23887670, -0.03267062, 0.97050016]])
    assert_array_almost_equal(pls_ca.y_weights_, y_weights)

    y_rotations = np.array(
        [[+0.58989127, 0.7168115, 0.30665872],
         [+0.77134053, -0.70791757, 0.19786539],
         [-0.23887670, -0.00343595, 0.94162826]])
    assert_array_almost_equal(pls_ca.y_rotations_, y_rotations)

    # 2) Regression PLS (PLS2): "Non regression test"
    # ===============================================
    # The results were checked against the R-packages plspm, misOmics and pls
    pls_2 = pls.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, -0.00443647, 0.78983213],
         [-0.74697144, -0.32172099, -0.58183269],
         [-0.25668686, 0.94682413, -0.19399983]])
    assert_array_almost_equal(pls_2.x_weights_, x_weights)

    x_loadings = np.array(
        [[-0.61470416, -0.24574278, 0.78983213],
         [-0.65625755, -0.14396183, -0.58183269],
         [-0.51733059, 1.00609417, -0.19399983]])
    assert_array_almost_equal(pls_2.x_loadings_, x_loadings)

    y_weights = np.array(
        [[+0.32456184, 0.29892183, 0.20316322],
         [+0.42439636, 0.61970543, 0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    assert_array_almost_equal(pls_2.y_weights_, y_weights)

    y_loadings = np.array(
        [[+0.32456184, 0.29892183, 0.20316322],
         [+0.42439636, 0.61970543, 0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    assert_array_almost_equal(pls_2.y_loadings_, y_loadings)

    # 3) Another non-regression test of Canonical PLS on random dataset
    # =================================================================
    # The results were checked against the R-package plspm
    n = 500
    p_noise = 10
    q_noise = 5
    # 2 latents vars:
    # A local RandomState seeded with 11 produces the same stream as seeding
    # the global generator with 11, without touching global state.
    rng = np.random.RandomState(11)
    l1 = rng.normal(size=n)
    l2 = rng.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    X = latents + rng.normal(size=4 * n).reshape((n, 4))
    Y = latents + rng.normal(size=4 * n).reshape((n, 4))
    X = np.concatenate(
        (X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1)
    Y = np.concatenate(
        (Y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1)

    pls_ca = pls.PLSCanonical(n_components=3)
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[0.65803719, 0.19197924, 0.21769083],
         [0.7009113, 0.13303969, -0.15376699],
         [0.13528197, -0.68636408, 0.13856546],
         [0.16854574, -0.66788088, -0.12485304],
         [-0.03232333, -0.04189855, 0.40690153],
         [0.1148816, -0.09643158, 0.1613305],
         [0.04792138, -0.02384992, 0.17175319],
         [-0.06781, -0.01666137, -0.18556747],
         [-0.00266945, -0.00160224, 0.11893098],
         [-0.00849528, -0.07706095, 0.1570547],
         [-0.00949471, -0.02964127, 0.34657036],
         [-0.03572177, 0.0945091, 0.3414855],
         [0.05584937, -0.02028961, -0.57682568],
         [0.05744254, -0.01482333, -0.17431274]])
    assert_array_almost_equal(pls_ca.x_weights_, x_weights)

    x_loadings = np.array(
        [[0.65649254, 0.1847647, 0.15270699],
         [0.67554234, 0.15237508, -0.09182247],
         [0.19219925, -0.67750975, 0.08673128],
         [0.2133631, -0.67034809, -0.08835483],
         [-0.03178912, -0.06668336, 0.43395268],
         [0.15684588, -0.13350241, 0.20578984],
         [0.03337736, -0.03807306, 0.09871553],
         [-0.06199844, 0.01559854, -0.1881785],
         [0.00406146, -0.00587025, 0.16413253],
         [-0.00374239, -0.05848466, 0.19140336],
         [0.00139214, -0.01033161, 0.32239136],
         [-0.05292828, 0.0953533, 0.31916881],
         [0.04031924, -0.01961045, -0.65174036],
         [0.06172484, -0.06597366, -0.1244497]])
    assert_array_almost_equal(pls_ca.x_loadings_, x_loadings)

    y_weights = np.array(
        [[0.66101097, 0.18672553, 0.22826092],
         [0.69347861, 0.18463471, -0.23995597],
         [0.14462724, -0.66504085, 0.17082434],
         [0.22247955, -0.6932605, -0.09832993],
         [0.07035859, 0.00714283, 0.67810124],
         [0.07765351, -0.0105204, -0.44108074],
         [-0.00917056, 0.04322147, 0.10062478],
         [-0.01909512, 0.06182718, 0.28830475],
         [0.01756709, 0.04797666, 0.32225745]])
    assert_array_almost_equal(pls_ca.y_weights_, y_weights)

    y_loadings = np.array(
        [[0.68568625, 0.1674376, 0.0969508],
         [0.68782064, 0.20375837, -0.1164448],
         [0.11712173, -0.68046903, 0.12001505],
         [0.17860457, -0.6798319, -0.05089681],
         [0.06265739, -0.0277703, 0.74729584],
         [0.0914178, 0.00403751, -0.5135078],
         [-0.02196918, -0.01377169, 0.09564505],
         [-0.03288952, 0.09039729, 0.31858973],
         [0.04287624, 0.05254676, 0.27836841]])
    assert_array_almost_equal(pls_ca.y_loadings_, y_loadings)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_weights_, "x weights are not orthogonal")
    check_ortho(pls_ca.y_weights_, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_scores_, "x scores are not orthogonal")
    check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")
def test_pls():
    """Exercise PLSCanonical and PLSRegression against reference values.

    Part 1 fits canonical PLS on the Linnerud data with both the NIPALS and
    the SVD algorithm and compares their loadings, then checks orthogonality
    of weights and scores, the ``X = TP'`` / ``Y = UQ'`` reconstruction and
    ``transform`` on the training data, and finally compares weights and
    rotations with hard-coded reference values up to a per-column sign.
    Part 2 does the reference comparison for PLS regression (PLS2).
    Part 3 repeats the reference comparison for canonical PLS on a synthetic
    dataset drawn from ``check_random_state(11)``.
    """
    d = load_linnerud()
    X = d.data
    Y = d.target
    # 1) Canonical (symmetric) PLS (PLS 2 blocks canonical mode A)
    # ===========================================================
    # Compare 2 algo.: nipals vs. svd
    # ------------------------------
    pls_bynipals = pls_.PLSCanonical(n_components=X.shape[1])
    pls_bynipals.fit(X, Y)
    pls_bysvd = pls_.PLSCanonical(algorithm="svd", n_components=X.shape[1])
    pls_bysvd.fit(X, Y)
    # check equality of the loadings (compared directly, no sign correction)
    assert_array_almost_equal(
        pls_bynipals.x_loadings_,
        pls_bysvd.x_loadings_, decimal=5,
        err_msg="nipals and svd implementations lead to different x loadings")

    assert_array_almost_equal(
        pls_bynipals.y_loadings_,
        pls_bysvd.y_loadings_, decimal=5,
        err_msg="nipals and svd implementations lead to different y loadings")

    # Check PLS properties (with n_components=X.shape[1])
    # ---------------------------------------------------
    plsca = pls_.PLSCanonical(n_components=X.shape[1])
    plsca.fit(X, Y)
    T = plsca.x_scores_
    P = plsca.x_loadings_
    Wx = plsca.x_weights_
    U = plsca.y_scores_
    Q = plsca.y_loadings_
    Wy = plsca.y_weights_

    def check_ortho(M, err_msg):
        # M.T @ M must equal its own diagonal, i.e. all off-diagonal inner
        # products between columns of M are (almost) zero.
        K = np.dot(M.T, M)
        assert_array_almost_equal(K, np.diag(np.diag(K)), err_msg=err_msg)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(Wx, "x weights are not orthogonal")
    check_ortho(Wy, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(T, "x scores are not orthogonal")
    check_ortho(U, "y scores are not orthogonal")

    # Check X = TP' and Y = UQ' (with (p == q) components)
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # center scale X, Y
    Xc, Yc, x_mean, y_mean, x_std, y_std =\
        pls_._center_scale_xy(X.copy(), Y.copy(), scale=True)
    assert_array_almost_equal(Xc, np.dot(T, P.T), err_msg="X != TP'")
    assert_array_almost_equal(Yc, np.dot(U, Q.T), err_msg="Y != UQ'")

    # Check that rotations on training data lead to scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    Xr = plsca.transform(X)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    Xr, Yr = plsca.transform(X, Y)
    assert_array_almost_equal(Xr, plsca.x_scores_,
                              err_msg="rotation on X failed")
    assert_array_almost_equal(Yr, plsca.y_scores_,
                              err_msg="rotation on Y failed")

    # "Non regression test" on canonical PLS
    # --------------------------------------
    # The results were checked against the R-package plspm
    pls_ca = pls_.PLSCanonical(n_components=X.shape[1])
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, 0.25616119, -0.74715187],
         [-0.74697144, 0.11930791, 0.65406368],
         [-0.25668686, -0.95924297, -0.11817271]])
    # x_weights_sign_flip holds columns of 1 or -1, depending on sign flip
    # between R and python
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_rotations = np.array(
        [[-0.61330704, 0.41591889, -0.62297525],
         [-0.74697144, 0.31388326, 0.77368233],
         [-0.25668686, -0.89237972, -0.24121788]])
    x_rotations_sign_flip = pls_ca.x_rotations_ / x_rotations

    y_weights = np.array(
        [[+0.58989127, 0.7890047, 0.1717553],
         [+0.77134053, -0.61351791, 0.16920272],
         [-0.23887670, -0.03267062, 0.97050016]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_rotations = np.array(
        [[+0.58989127, 0.7168115, 0.30665872],
         [+0.77134053, -0.70791757, 0.19786539],
         [-0.23887670, -0.00343595, 0.94162826]])
    y_rotations_sign_flip = pls_ca.y_rotations_ / y_rotations

    # x_weights = X.dot(x_rotation)
    # Hence R/python sign flip should be the same in x_weight and x_rotation
    assert_array_almost_equal(x_rotations_sign_flip, x_weights_sign_flip)
    # This tests that R / python give the same result up to column
    # sign indeterminacy
    assert_array_almost_equal(np.abs(x_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_rotations_sign_flip, y_weights_sign_flip)
    assert_array_almost_equal(np.abs(y_rotations_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 2) Regression PLS (PLS2): "Non regression test"
    # ===============================================
    # The results were checked against the R-packages plspm, misOmics and pls
    pls_2 = pls_.PLSRegression(n_components=X.shape[1])
    pls_2.fit(X, Y)

    x_weights = np.array(
        [[-0.61330704, -0.00443647, 0.78983213],
         [-0.74697144, -0.32172099, -0.58183269],
         [-0.25668686, 0.94682413, -0.19399983]])
    x_weights_sign_flip = pls_2.x_weights_ / x_weights

    x_loadings = np.array(
        [[-0.61470416, -0.24574278, 0.78983213],
         [-0.65625755, -0.14396183, -0.58183269],
         [-0.51733059, 1.00609417, -0.19399983]])
    x_loadings_sign_flip = pls_2.x_loadings_ / x_loadings

    y_weights = np.array(
        [[+0.32456184, 0.29892183, 0.20316322],
         [+0.42439636, 0.61970543, 0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    y_weights_sign_flip = pls_2.y_weights_ / y_weights

    y_loadings = np.array(
        [[+0.32456184, 0.29892183, 0.20316322],
         [+0.42439636, 0.61970543, 0.19320542],
         [-0.13143144, -0.26348971, -0.17092916]])
    y_loadings_sign_flip = pls_2.y_loadings_ / y_loadings

    # x_loadings[:, i] = Xi.dot(x_weights[:, i]) \forall i
    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)

    # 3) Another non-regression test of Canonical PLS on random dataset
    # =================================================================
    # The results were checked against the R-package plspm
    n = 500
    p_noise = 10
    q_noise = 5
    # 2 latents vars:
    # NOTE: the order of the rng draws below determines the data; the
    # reference arrays that follow are tied to this exact sequence.
    rng = check_random_state(11)
    l1 = rng.normal(size=n)
    l2 = rng.normal(size=n)
    latents = np.array([l1, l1, l2, l2]).T
    X = latents + rng.normal(size=4 * n).reshape((n, 4))
    Y = latents + rng.normal(size=4 * n).reshape((n, 4))
    X = np.concatenate(
        (X, rng.normal(size=p_noise * n).reshape(n, p_noise)), axis=1)
    Y = np.concatenate(
        (Y, rng.normal(size=q_noise * n).reshape(n, q_noise)), axis=1)

    pls_ca = pls_.PLSCanonical(n_components=3)
    pls_ca.fit(X, Y)

    x_weights = np.array(
        [[0.65803719, 0.19197924, 0.21769083],
         [0.7009113, 0.13303969, -0.15376699],
         [0.13528197, -0.68636408, 0.13856546],
         [0.16854574, -0.66788088, -0.12485304],
         [-0.03232333, -0.04189855, 0.40690153],
         [0.1148816, -0.09643158, 0.1613305],
         [0.04792138, -0.02384992, 0.17175319],
         [-0.06781, -0.01666137, -0.18556747],
         [-0.00266945, -0.00160224, 0.11893098],
         [-0.00849528, -0.07706095, 0.1570547],
         [-0.00949471, -0.02964127, 0.34657036],
         [-0.03572177, 0.0945091, 0.3414855],
         [0.05584937, -0.02028961, -0.57682568],
         [0.05744254, -0.01482333, -0.17431274]])
    x_weights_sign_flip = pls_ca.x_weights_ / x_weights

    x_loadings = np.array(
        [[0.65649254, 0.1847647, 0.15270699],
         [0.67554234, 0.15237508, -0.09182247],
         [0.19219925, -0.67750975, 0.08673128],
         [0.2133631, -0.67034809, -0.08835483],
         [-0.03178912, -0.06668336, 0.43395268],
         [0.15684588, -0.13350241, 0.20578984],
         [0.03337736, -0.03807306, 0.09871553],
         [-0.06199844, 0.01559854, -0.1881785],
         [0.00406146, -0.00587025, 0.16413253],
         [-0.00374239, -0.05848466, 0.19140336],
         [0.00139214, -0.01033161, 0.32239136],
         [-0.05292828, 0.0953533, 0.31916881],
         [0.04031924, -0.01961045, -0.65174036],
         [0.06172484, -0.06597366, -0.1244497]])
    x_loadings_sign_flip = pls_ca.x_loadings_ / x_loadings

    y_weights = np.array(
        [[0.66101097, 0.18672553, 0.22826092],
         [0.69347861, 0.18463471, -0.23995597],
         [0.14462724, -0.66504085, 0.17082434],
         [0.22247955, -0.6932605, -0.09832993],
         [0.07035859, 0.00714283, 0.67810124],
         [0.07765351, -0.0105204, -0.44108074],
         [-0.00917056, 0.04322147, 0.10062478],
         [-0.01909512, 0.06182718, 0.28830475],
         [0.01756709, 0.04797666, 0.32225745]])
    y_weights_sign_flip = pls_ca.y_weights_ / y_weights

    y_loadings = np.array(
        [[0.68568625, 0.1674376, 0.0969508],
         [0.68782064, 0.20375837, -0.1164448],
         [0.11712173, -0.68046903, 0.12001505],
         [0.17860457, -0.6798319, -0.05089681],
         [0.06265739, -0.0277703, 0.74729584],
         [0.0914178, 0.00403751, -0.5135078],
         [-0.02196918, -0.01377169, 0.09564505],
         [-0.03288952, 0.09039729, 0.31858973],
         [0.04287624, 0.05254676, 0.27836841]])
    y_loadings_sign_flip = pls_ca.y_loadings_ / y_loadings

    assert_array_almost_equal(x_loadings_sign_flip, x_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(x_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(x_loadings_sign_flip), 1, 4)

    assert_array_almost_equal(y_loadings_sign_flip, y_weights_sign_flip, 4)
    assert_array_almost_equal(np.abs(y_weights_sign_flip), 1, 4)
    assert_array_almost_equal(np.abs(y_loadings_sign_flip), 1, 4)

    # Orthogonality of weights
    # ~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_weights_, "x weights are not orthogonal")
    check_ortho(pls_ca.y_weights_, "y weights are not orthogonal")

    # Orthogonality of latent scores
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    check_ortho(pls_ca.x_scores_, "x scores are not orthogonal")
    check_ortho(pls_ca.y_scores_, "y scores are not orthogonal")
def test_load_linnerud():
    """Check the shapes and basic metadata returned by ``load_linnerud``."""
    linnerud = load_linnerud()
    expected_shape = (20, 3)

    # Both the data and the target arrays have 20 rows and 3 columns.
    assert_equal(linnerud.data.shape, expected_shape)
    assert_equal(linnerud.target.shape, expected_shape)

    # One name per target column, and a non-empty description.
    assert_equal(len(linnerud.target_names), 3)
    assert_true(linnerud.DESCR)
# y_test = _make_matrix_classes(y_test) # rna.fit(X_train, y_train) # y_pred = rna.predict(X_test) # s = rna.score(X_test, y_test) # print("score: {}".format(s)) with MLP( mse_target=1e-8, max_epochs=1e8, hidden_layer_sizes=(6, 30, 30, 6), output_fn=tf.nn.sigmoid, verbose=True, learning_rate=1e-6, ) as rna: linnerud = load_linnerud() ss = ShuffleSplit() train_index, test_index = next(iter(ss.split(linnerud.data))) X_train = linnerud.data[train_index] y_train = linnerud.target[train_index] X_test = linnerud.data[test_index] y_test = linnerud.target[test_index] X_train = np.array(X_train) y_train = np.array(y_train) X_test = np.array(X_test) y_test = np.array(y_test) norm_x = MinMaxScaler(feature_range=(0.0, 1.0), copy=True)
def _show(message, value):
    """Print *message* and *value* with a single two-argument print call."""
    print(message, value)


# Loaders whose result is also kept in ``iris`` (the name is reused for each
# dataset in turn, so it ends up bound to the digits dataset).
iris = datasets.load_breast_cancer()
_show("the output of load_breast_cancer() :: ", iris)

# load_diabetes() executed
iris = datasets.load_diabetes()
_show("the output of load_diabetes() :: ", iris)

# load_digits() executed
iris = datasets.load_digits()
_show("the output of load_digits() :: ", iris)

# Remaining loaders and generators are called inline and printed directly.
# load_iris() executed
_show("the output of load_iris() :: ", datasets.load_iris())

# load_linnerud() executed
_show("the output of load_linnerud() :: ", datasets.load_linnerud())

# load_wine() executed
_show("the output of load_wine() :: ", datasets.load_wine())

# make_blobs() executed
_show("the output of make_blobs() :: ", datasets.make_blobs())

# make_circles() executed
_show("the output of make_circles() :: ", datasets.make_circles())

# make_classification() executed
_show("the output of make_classification() :: ",
      datasets.make_classification())

# make_friedman1() executed
_show("the output of make_friedman1() :: ", datasets.make_friedman1())
def test_update_database_dict():
    """Feed ``update_database`` dict databases and check the resulting split.

    The Linnerud rows are first turned into a ``{'core <i>': {...}}``
    database; the training arrays are compared with hard-coded rows and the
    test arrays with their expected length. A second, two-entry database is
    then added and only the resulting lengths are checked.
    """
    sm = tm.Surrogate_Models()
    variables, objectives = datasets.load_linnerud(return_X_y=True)
    sm.random = 57757

    ind_names = ['a', 'b', 'c']
    obj_names = ['d', 'e', 'f']

    # One 'core <row index>' entry per sample, mapping the first three
    # values of each row onto the variable names above.
    test_dict = {
        'core {}'.format(row): {
            'independent variables': dict(zip(ind_names, ind)),
            'dependent variables': dict(zip(obj_names, obj)),
        }
        for row, (ind, obj) in enumerate(zip(variables, objectives))
    }
    sm.update_database(['a', 'b', 'c'], ['d', 'e', 'f'], database=test_dict)

    ind_var_given = [
        [11, 230, 80],
        [6, 70, 31],
        [2, 110, 43],
        [14, 215, 105],
        [15, 225, 73],
        [4, 60, 25],
        [12, 105, 37],
        [12, 101, 101],
        [13, 210, 115],
        [13, 155, 58],
        [2, 110, 60],
        [15, 200, 40],
        [6, 125, 40],
        [8, 101, 38],
        [17, 120, 38],
    ]
    obj_var_given = [
        [157, 32, 52],
        [193, 36, 46],
        [138, 33, 68],
        [154, 34, 64],
        [156, 33, 54],
        [176, 37, 54],
        [162, 35, 62],
        [193, 38, 58],
        [166, 33, 52],
        [189, 35, 46],
        [189, 37, 52],
        [176, 31, 74],
        [167, 34, 60],
        [211, 38, 56],
        [169, 34, 50],
    ]
    np.testing.assert_array_equal(sm.var_train, ind_var_given)
    np.testing.assert_array_equal(sm.obj_train, obj_var_given)
    assert len(sm.var_test) == 5
    assert len(sm.obj_test) == 5

    # Second update: two entries with identical values under distinct keys.
    test_dict = {
        core_name: {
            'independent variables': {'a': 12, 'b': 250, 'c': 85},
            'dependent variables': {'d': 165, 'e': 33, 'f': 57},
        }
        for core_name in ('core 100', 'core 101')
    }
    sm.update_database(['a', 'b', 'c'], ['d', 'e', 'f'], database=test_dict)

    assert len(sm.var_train) == 16
    assert len(sm.obj_train) == 16
    assert len(sm.var_test) == 6
    assert len(sm.obj_test) == 6
def main():
    """Interactively pick a scikit-learn dataset and write an HTML report.

    The user chooses a dataset from a numbered menu (empty input selects the
    default, 3). The data is put into a DataFrame, every column is classified
    with ``tot``, the four plotting helpers are run, and their outputs are
    collected per predictor into a report saved as
    ``HW_report_<timestamp>.html``. The elapsed time is printed at the end.

    Returns:
        None.
    """
    start_t = datetime.now()

    # Built-in dataset choices, keyed by menu number. Loaders are looked up
    # by name and called only for the selected entry, so a loader that is
    # missing from the installed scikit-learn only matters if it is chosen.
    loader_names = {
        1: "load_boston",
        2: "load_iris",
        3: "load_diabetes",
        4: "load_digits",
        5: "load_linnerud",
        6: "load_wine",
        7: "load_breast_cancer",
    }
    default_choice = 3
    user_path_choice = 8

    # when response is continuous then no 'target_names'
    # when response is categorical set 'target_names'
    print("""
        Select SKLearn dataset to use: ~default is 3~
        1 - boston (response is continuous)
        2 - iris (response is catagorical [3 cats])
        3 - diabetes (response is continuous)
        4 - digits (response is an array)
        5 - linnerud (response is catagorical array of continuous)[3 cats])
        6 - wine (response is catagorical [3 cast])
        7 - breast cancer (response is catagorical {2cats/bool])
        8 - user choice
        """)

    # Keep asking until the answer is empty (default) or a known menu number.
    while True:
        raw_choice = input("Which set? ").strip()
        if not raw_choice:
            choice = default_choice
            break
        try:
            choice = int(raw_choice)
        except ValueError:
            choice = None
        if choice in loader_names or choice == user_path_choice:
            break
        print(f"{raw_choice} is an invalid entry. Please try again.")

    if choice == user_path_choice:
        # NOTE(review): this stores the path as a plain string, but the code
        # below reads .data / .feature_names / .target from the_ds. Loading
        # a dataset from the given path still needs to be implemented.
        the_ds = input("direct path to dataset: ")
    else:
        the_ds = getattr(datasets, loader_names[choice])()
    print(f"You chose {choice}")

    # turn source data into a pandas df
    working_df = pd.DataFrame(the_ds.data, columns=the_ds.feature_names)
    working_df["target"] = pd.Series(the_ds.target)

    # get column names; titles are the same names made identifier-friendly
    col_list = working_df.columns.values.tolist()
    titles = [
        i.replace(")", "_").replace("(", "").replace(" ", "_")
        for i in col_list
    ]
    working_df.columns = titles
    print_heading("Original Dataset")
    print(working_df)

    # ~RESPONSE~
    res_type = tot(the_ds.target)
    print_heading("Response type is " + res_type)

    # Group Predictor types. The frame's columns were just renamed, so it is
    # indexed with the new titles; the original names are kept as labels.
    type_mask = [tot(working_df[title]) for title in titles]
    predictor_array = np.column_stack((col_list, type_mask))
    pred_df = pd.DataFrame(predictor_array, columns=["Predictor", "Category"])

    # ~PREDICTORS~
    # Allow user to select the desired features
    # feature_list = predictor_select(the_ds)
    # enable above line to select only specific features to evaluate

    # "target" is a row of pred_df (a value in the Predictor column), not a
    # column, so it is excluded with a row filter.
    is_continuous = pred_df["Category"] == "continuous"
    is_predictor = pred_df["Predictor"] != "target"
    cont_feature_df = pred_df[is_continuous & is_predictor]
    cat_feature_df = pred_df[~is_continuous & is_predictor]

    if cont_feature_df.empty:
        print("No Continuous!")
    else:
        print(cont_feature_df)
    if cat_feature_df.empty:
        print("No Categorical!")
    else:
        print(cat_feature_df)

    # Make plots (all four helpers are run regardless of the response type)
    cat_con_file_list = plot_cat_cont(the_ds)
    con_con_file_list = plot_cont_cont(the_ds)
    con_cat_file_list = plot_cont_cat(the_ds)
    cat_cat_file_list = plot_cat_cat(the_ds)

    # Generate Report DF
    report_col = (
        "Category",
        "p-val_&_t-val",
        "Regression",
        "Logistic_Regression",
        "Difference_with_mean",
        "Random_Forest",
    )
    report_df = pd.DataFrame("", index=col_list, columns=report_col)
    report_df = report_df.drop(["target"])
    pred_df = pred_df.set_index(["Predictor"])
    pred_df = pred_df.drop(["target"])

    # Update Report with data
    report_df.index.name = "Predictor"
    pred_df.index = report_df.index

    # add plots to report, one report column per helper output
    list_to_df(pred_df, report_df, "Category")
    plot_outputs = (
        (cat_con_file_list, "Logistic_Regression"),
        (con_con_file_list, "Regression"),
        (con_cat_file_list, "Random_Forest"),
        (cat_cat_file_list, "Difference_with_mean"),
    )
    for file_list, column in plot_outputs:
        temp_df = pd.DataFrame(file_list, columns=[column])
        list_to_df(temp_df, report_df, column)

    # Save report to HTML
    report_df.to_html("HW_report_" +
                      datetime.now().strftime("%Y_%m_%d-%H_%M") + ".html")
    print(datetime.now() - start_t)