def FriedmanDataset_3():
    """Build a perturbed Friedman #3 training set and an untouched test set.

    Returns:
        tuple: ``(features_train, target_train, features_test, target_test)``
        where the training part has 240 samples and the test part has 1000.
    """
    train_set = datasets.make_friedman3(240, random_state=0)
    test_set = datasets.make_friedman3(1000, random_state=0)

    features_train, target_train = train_set[0], train_set[1]
    features_test, target_test = test_set[0], test_set[1]

    # Perturb every training row in place with zero-mean Gaussian noise whose
    # standard deviation is one third of the corresponding feature value.
    for row in features_train:
        row += np.random.normal(0, row / 3)

    return features_train, target_train, features_test, target_test
def test_make_friedman3():
    """Check output shapes and the noise-free target formula of make_friedman3."""
    X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)

    assert_equal(X.shape, (5, 4), "X shape mismatch")
    assert_equal(y.shape, (5,), "y shape mismatch")

    # With noise=0.0 the target must equal the closed-form expression exactly.
    x0, x1, x2, x3 = X[:, 0], X[:, 1], X[:, 2], X[:, 3]
    expected = np.arctan((x1 * x2 - 1 / (x1 * x3)) / x0)
    assert_array_almost_equal(y, expected)
def test_regression_synthetic():
    """Check GradientBoostingRegressor on the three Friedman datasets.

    The datasets are the synthetic regression problems used in Leo Breiman,
    "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).

    For each dataset 1200 samples are drawn from one shared random state; the
    first 200 train the model and the remaining 1000 are used to compute the
    mean squared error, which must stay below a dataset-specific bound.
    """
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         # 2 is the smallest value that still allows a split;
                         # a node needs two samples to be divided, so this is
                         # the explicit form of the former value 1.
                         'min_samples_split': 2,
                         'learning_rate': 0.1, 'loss': 'ls'}

    # Friedman1 (default estimator parameters)
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor()
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse
def friedman3(n_samples=100, noise=0.0, random_state=None):
    """Forward to ``datasets.make_friedman3`` with the given settings.

    Args:
        n_samples: passed through as ``n_samples``.
        noise: passed through as ``noise``.
        random_state: passed through as ``random_state``.

    Returns:
        Whatever ``datasets.make_friedman3`` returns.
    """
    generator_options = {
        "n_samples": n_samples,
        "noise": noise,
        "random_state": random_state,
    }
    return datasets.make_friedman3(**generator_options)
def test_make_friedman3():
    """Check output shapes and the noise-free target formula of make_friedman3."""
    X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 4), "X shape mismatch"
    assert y.shape == (5, ), "y shape mismatch"

    # Rebuild the target from the individual feature columns.
    first, second, third, fourth = (X[:, col] for col in range(4))
    numerator = second * third - 1 / (second * fourth)
    assert_array_almost_equal(y, np.arctan(numerator / first))
def friedman_3(n, noise):
    """Generate the Friedman #3 data set.

    Args:
        n (int): number of data samples
        noise (float): added noise

    Returns:
        The result of ``make_friedman3`` for ``n`` samples and the given noise.
    """
    dataset = make_friedman3(n_samples=n, noise=noise)
    return dataset
def define_tested_reg_datasets():
    """Return the regression datasets under test, keyed by name.

    The loaders and generators are evaluated in the order listed; every
    generator is seeded with 1960.
    """
    return {
        "diabetes": datasets.load_diabetes(),
        "boston": datasets.load_boston(),
        "freidman1": datasets.make_friedman1(random_state=1960),
        "freidman2": datasets.make_friedman2(random_state=1960),
        "freidman3": datasets.make_friedman3(random_state=1960),
        "RandomReg_10": datasets.make_regression(n_features=10, random_state=1960),
        "RandomReg_100": datasets.make_regression(n_features=100, random_state=1960),
        "RandomReg_500": datasets.make_regression(n_features=500, random_state=1960),
    }
def generateDatas(N, F, choice):
    """Return ``(X, Y)`` for the dataset selected by ``choice``.

    Args:
        N: number of samples for the synthetic datasets.
        F: second generator argument. For ``'f1'`` it is the number of
            features. For ``'f2'`` and ``'f3'`` it is forwarded as ``noise``,
            which is the parameter the original positional call reached.
        choice: one of ``'f1'``, ``'f2'``, ``'f3'`` or ``'boston'``.

    Returns:
        tuple: feature matrix ``X`` and target vector ``Y``.

    Raises:
        ValueError: if ``choice`` is not one of the supported names.
    """
    if choice == 'f1':
        X, Y = datasets.make_friedman1(n_samples=N, n_features=F, noise=1)
    elif choice == 'f2':
        # NOTE(review): make_friedman2/3 take no feature count, so F acts as
        # the noise level here -- confirm this is what callers expect.
        X, Y = datasets.make_friedman2(n_samples=N, noise=F)
    elif choice == 'f3':
        # Was `X, Y == ...`, a comparison that never bound X and Y.
        X, Y = datasets.make_friedman3(n_samples=N, noise=F)
    elif choice == 'boston':
        boston = datasets.load_boston()
        X, Y = boston.data, boston.target
    else:
        raise ValueError("unknown dataset choice: %r" % (choice,))
    return X, Y
def genFriedman(self, i=1, N=240, D=10):
    """Generate Friedman dataset number ``i`` (1, 2 or 3).

    ``N`` and ``D`` are handed to the scikit-learn generator as its first and
    second positional arguments.

    Returns:
        tuple: ``(X, Y)`` as produced by the selected generator.

    Raises:
        Exception: if ``i`` is not 1, 2 or 3.
    """
    if i not in range(1, 4):
        raise Exception('not a correct dataset')

    if i == 1:
        generator = datasets.make_friedman1
    elif i == 2:
        generator = datasets.make_friedman2
    else:
        generator = datasets.make_friedman3

    X, Y = generator(N, D)
    return X, Y
def genFriedman(self, i=1, N=240, D=10):
    """Return ``(X, Y)`` from the Friedman generator selected by ``i``.

    Args:
        i: dataset number; must be 1, 2 or 3.
        N: first positional argument given to the generator.
        D: second positional argument given to the generator.

    Raises:
        Exception: if ``i`` is outside 1..3.
    """
    valid_ids = range(1, 4)
    if i not in valid_ids:
        raise Exception('not a correct dataset')

    factory_names = ("make_friedman1", "make_friedman2", "make_friedman3")
    for dataset_id, factory_name in zip(valid_ids, factory_names):
        if i == dataset_id:
            X, Y = getattr(datasets, factory_name)(N, D)
    return X, Y
def make_data(n_samples=1000, n_features=1, n_targets=1, informative_prop=1.0,
              noise=0.0, test_prop=0.1, valid_prop=0.3, method='linear'):
    """Create a scaled regression dataset split into train/valid/test parts.

    Args:
        n_samples: number of samples for the generated datasets.
        n_features: number of features (used by the ``'linear'`` method only).
        n_targets: number of targets (used by the ``'linear'`` method only).
        informative_prop: fraction of features that are informative
            (``'linear'`` method only).
        noise: noise level handed to the generator.
        test_prop: proportion handed to ``train_valid_test_split`` as
            ``test_prop``.
        valid_prop: proportion handed to ``train_valid_test_split`` as
            ``valid_prop``.
        method: ``'linear'`` uses ``make_regression``, ``'boston'`` uses
            ``load_boston``, anything else uses ``make_friedman3``.

    Returns:
        dict: theano shared variables ``X_train``, ``Y_train``, ``X_valid``,
        ``Y_valid``, ``X_test``, ``Y_test``, the three example counts, and
        ``input_dim`` / ``output_dim`` taken from the final array shapes.
    """
    if method == 'linear':
        X, Y = make_regression(
            n_features=n_features,
            n_informative=int(n_features * informative_prop),
            noise=noise,
            n_targets=n_targets,
            n_samples=n_samples,
            shuffle=False,
            bias=0.0)
    elif method == 'boston':
        boston = load_boston()
        X = boston.data
        Y = boston.target
    else:
        # make_friedman3 has no n_features parameter; passing one raised a
        # TypeError. The feature count is read from X below instead.
        X, Y = make_friedman3(n_samples=n_samples, noise=noise)

    # Make both arrays 2-D *before* scaling: MinMaxScaler works column-wise on
    # 2-D input, and single-target generators return a 1-D Y.
    if len(X.shape) == 1:
        X = X.reshape(X.shape[0], -1)
    if len(Y.shape) == 1:
        Y = Y.reshape(Y.shape[0], -1)
    n_features = X.shape[1]
    n_targets = Y.shape[1]

    X = MinMaxScaler(feature_range=(0.0, 1.0)).fit_transform(X)
    X = X.astype(theano.config.floatX)
    Y = MinMaxScaler(feature_range=(0.0, 1.0)).fit_transform(Y)
    Y = Y.astype(theano.config.floatX)

    # test_prop used to be overwritten with valid_prop, so the caller's
    # test_prop argument had no effect.
    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = \
        train_valid_test_split(X, Y,
                               test_prop=test_prop, valid_prop=valid_prop)

    return dict(
        X_train=theano.shared(X_train),
        Y_train=theano.shared(Y_train),
        X_valid=theano.shared(X_valid),
        Y_valid=theano.shared(Y_valid),
        X_test=theano.shared(X_test),
        Y_test=theano.shared(Y_test),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        input_dim=n_features,
        output_dim=n_targets)
def main():
    """Generate a Friedman #3 sample and write it as CSV.

    The output goes to ``<argv[1]>/friedman3/friedman3_prep.csv`` with the
    columns ``x1``..``x4`` and ``y``.
    """
    base_dir = sys.argv[1]
    column_names = ["x1", "x2", "x3", "x4", "y"]

    features, target = data.make_friedman3(n_samples=10000, random_state=123456, noise=0.01)

    # Turn the target into a single column so it can be appended to the
    # feature matrix.
    target_column = np.matrix(target).T
    table = np.append(features, target_column, axis=1)

    frame = pd.DataFrame(table, columns=column_names)
    # `scale` is defined outside this block; only its second result is kept.
    frame = scale(frame)[1]
    # TODO Transform box-cox.
    frame.to_csv(base_dir + '/friedman3/friedman3_prep.csv', index=False)
def generate_Dataset(name_dataset):
    """Return ``(x, y)`` for the dataset identified by ``name_dataset``.

    ``f`` (defined outside this block) maps the name to an index: 1, 2 and 3
    select the Friedman #1, #2 and #3 generators; any other value loads the
    Boston data via ``datasets.load_boston(return_X_y=True)``.
    """
    # default
    ind = f(name_dataset)
    if ind == 1:
        x,y = datasets.make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None)
    elif ind == 2:
        x,y = datasets.make_friedman2(n_samples=100, noise=0.0, random_state=None)
    elif ind == 3:
        x,y = datasets.make_friedman3(n_samples=100, noise=0.0, random_state=None)
    else :
        x, y = datasets.load_boston(return_X_y=True)
    # NOTE(review): the original layout was lost; this conversion is assumed
    # to apply to every branch rather than only to the Boston fallback --
    # confirm against the upstream file.
    x = x.tolist()
    return x,y
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = dict(n_estimators=100, max_depth=4,
                             min_samples_split=2, learning_rate=0.1,
                             loss='ls')

    def holdout_mse(X, y, **estimator_kwargs):
        # Train on the first 200 samples, score on the rest.
        clf = GradientBoostingRegressor(**estimator_kwargs)
        clf.fit(X[:200], y[:200])
        return mean_squared_error(y[200:], clf.predict(X[200:]))

    # Friedman1 -- default estimator settings apart from presort
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state,
                                   noise=1.0)
    for presort in (True, False):
        assert_less(holdout_mse(X, y, presort=presort), 5.0)

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    for presort in (True, False):
        mse = holdout_mse(X, y, presort=presort, **regression_params)
        assert_less(mse, 1700.0)

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    for presort in (True, False):
        mse = holdout_mse(X, y, presort=presort, **regression_params)
        assert_less(mse, 0.015)
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    tuned_params = {'n_estimators': 100, 'max_depth': 4,
                    'min_samples_split': 2, 'learning_rate': 0.1,
                    'loss': 'ls'}

    # (generator, extra generator kwargs, estimator kwargs, MSE upper bound).
    # The datasets are drawn inside the loop, one after another, because they
    # all consume the same random state.
    cases = (
        (datasets.make_friedman1, {'noise': 1.0}, {}, 5.0),
        (datasets.make_friedman2, {}, tuned_params, 1700.0),
        (datasets.make_friedman3, {}, tuned_params, 0.015),
    )

    for make_dataset, dataset_kwargs, estimator_kwargs, max_mse in cases:
        X, y = make_dataset(n_samples=1200, random_state=random_state,
                            **dataset_kwargs)
        X_train, X_test = X[:200], X[200:]
        y_train, y_test = y[:200], y[200:]

        for presort in (True, False):
            clf = GradientBoostingRegressor(presort=presort,
                                            **estimator_kwargs)
            clf.fit(X_train, y_train)
            mse = mean_squared_error(y_test, clf.predict(X_test))
            assert_less(mse, max_mse)
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = dict(
        n_estimators=100,
        max_depth=4,
        min_samples_split=2,
        learning_rate=0.1,
        loss="squared_error",
    )

    def holdout_mse(model, X, y):
        # The first 200 samples train the model; the remainder scores it.
        model.fit(X[:200], y[:200])
        return mean_squared_error(y[200:], model.predict(X[200:]))

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state, noise=1.0)
    mse = holdout_mse(GradientBoostingRegressor(), X, y)
    assert mse < 5.0

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    mse = holdout_mse(GradientBoostingRegressor(**regression_params), X, y)
    assert mse < 1700.0

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    mse = holdout_mse(GradientBoostingRegressor(**regression_params), X, y)
    assert mse < 0.015
def uniform_dataset(args):
    """Draw uniform random features and random labels from {-1, 1}.

    Reads ``args.num_examples`` and ``args.num_features`` and returns the
    tuple ``(X, y)``.
    """
    n_rows, n_cols = args.num_examples, args.num_features
    features = np.random.random(size=(n_rows, n_cols))
    labels = np.random.choice([-1, 1], size=n_rows)
    return (features, labels)


# Dataset name -> factory taking the parsed arguments object.
DATASETS = {
    "uniform": uniform_dataset,
    "hastie": lambda args: datasets.make_hastie_10_2(
        n_samples=args.num_examples,
    ),
    "friedman1": lambda args: datasets.make_friedman1(
        n_samples=args.num_examples,
        n_features=args.num_features,
    ),
    "friedman2": lambda args: datasets.make_friedman2(
        n_samples=args.num_examples,
        noise=args.noise,
    ),
    "friedman3": lambda args: datasets.make_friedman3(
        n_samples=args.num_examples,
        noise=args.noise,
    ),
    "make_regression": lambda args: datasets.make_regression(
        n_samples=args.num_examples,
        n_features=args.num_features,
        n_informative=args.num_informative,
    ),
}

# (label, estimator factory) pairs; `with_depth` and `with_best_first` are
# defined outside this block.
ENSEMBLE_REGRESSORS = [
    ("GB-D1", with_depth(ensemble.GradientBoostingRegressor, 1)),
    ("GB-D3", with_depth(ensemble.GradientBoostingRegressor, 3)),
    ("GB-B10", with_best_first(ensemble.GradientBoostingRegressor, 10)),
    ("RF-D1", with_depth(ensemble.RandomForestRegressor, 1)),
    ("RF-D3", with_depth(ensemble.RandomForestRegressor, 3)),
    ("RF-D5", with_depth(ensemble.RandomForestRegressor, 5)),
]
resutl.fit(X) # In[15]: resutl.labels_ # In[17]: from sklearn.datasets import make_friedman3 # In[26]: X, y = make_friedman3(n_samples=100, noise=0.0, random_state=0) #print(X) #print(y) results = KMeans(n_clusters=5, init='random') results.fit(X) # In[27]: results.labels_ # In[ ]:
def friedman3(n_samples=20000):
    """Generate Friedman #3 data wrapped in a ``DatasetFactory.Dataset``.

    Args:
        n_samples: number of samples requested from ``make_friedman3``.
    """
    generated = datasets.make_friedman3(n_samples=n_samples)
    features, labels = generated
    return DatasetFactory.Dataset(data=features, target=labels)
# NOTE(review): the next statement is the tail of a function whose `def` lies
# before this excerpt; it is kept unchanged.
    return (X, y)


# Dataset name -> factory taking the parsed arguments object. Each lambda
# reads only the attributes of `args` that its generator needs.
DATASETS = {
    "uniform": uniform_dataset,
    "hastie": lambda args: datasets.make_hastie_10_2(n_samples=args.num_examples),
    "friedman1": lambda args: datasets.make_friedman1(n_samples=args.num_examples, n_features=args.num_features),
    "friedman2": lambda args: datasets.make_friedman2(n_samples=args.num_examples, noise=args.noise),
    "friedman3": lambda args: datasets.make_friedman3(n_samples=args.num_examples, noise=args.noise),
    "make_regression": lambda args: datasets.make_regression(n_samples=args.num_examples, n_features=args.num_features, n_informative=args.num_informative)
}

# (label, estimator factory) pairs; `with_depth` and `with_best_first` are
# defined outside this excerpt.
ENSEMBLE_REGRESSORS = [
    ("GB-D1", with_depth(ensemble.GradientBoostingRegressor, 1)),
    ("GB-D3", with_depth(ensemble.GradientBoostingRegressor, 3)),
    ("GB-B10", with_best_first(ensemble.GradientBoostingRegressor, 10)),
    ("RF-D1", with_depth(ensemble.RandomForestRegressor, 1)),
    ("RF-D3", with_depth(ensemble.RandomForestRegressor, 3)),
    ("RF-D5", with_depth(ensemble.RandomForestRegressor, 5)),
]
# Keep the first dataset (x and y are produced before this excerpt) and draw
# its first three feature columns in the left-hand 3-D panel, coloured by y.
dataset_x1 = x
dataset_y1 = y
ax = fig.add_subplot(131, projection='3d')
my_scatter_plot = ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y, cmap=color_map)
fig.colorbar(my_scatter_plot)
plt.title('make_friedman1')

# Middle panel: make_friedman2.
x, y = dt.make_friedman2(n_samples=1000, random_state=rand_state)
dataset_x2 = x
dataset_y2 = y
ax = fig.add_subplot(132, projection='3d')
my_scatter_plot = ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y, cmap=color_map)
fig.colorbar(my_scatter_plot)
plt.title('make_friedman2')

# Right-hand panel: make_friedman3.
x, y = dt.make_friedman3(n_samples=1000, random_state=rand_state)
dataset_x3 = x
dataset_y3 = y
ax = fig.add_subplot(133, projection='3d')
my_scatter_plot = ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y, cmap=color_map)
fig.colorbar(my_scatter_plot)
# Figure-wide heading first, then the title of the current (third) panel.
plt.suptitle('make_friedman?() for Non-Linear Data', fontsize=20)
plt.title('make_friedman3')

# Disabled DataFrame dumps of the generated datasets.
# df_x1 = pd.DataFrame(dataset_x1, columns=['x0','x1','x2','x3','x4'])
# print(df_x1)
# df_y1 = pd.DataFrame(dataset_y1, columns=['y1'])
# print(df_y1)
# df_x2 = pd.DataFrame(dataset_x2, columns=['x0','x1','x2','x3'])
# print(df_x2)
# df_y2 = pd.DataFrame(dataset_y2,columns=['y2'])
# print(df_y2)
import gpboost as gpb
import numpy as np
import sklearn.datasets as datasets
import time
import pandas as pd
print("It is recommended that the examples are run in interactive mode")

# --------------------Simulate data----------------
ntrain = 5000 # number of samples for training
n = 2 * ntrain # combined number of training and test data
m = 500 # number of categories / levels for grouping variable
sigma2_1 = 1 # random effect variance
sigma2 = 1 ** 2 # error variance
# Simulate non-linear mean function
np.random.seed(1)  # seed NumPy's global generator, which all draws below use
X, F = datasets.make_friedman3(n_samples=n)
X = pd.DataFrame(X,columns=['variable_1','variable_2','variable_3','variable_4'])
F = F * 10**0.5 # with this choice, the fixed-effects regression function has the same variance as the random effects
# Simulate random effects
group_train = np.arange(ntrain) # grouping variable
# Overwrite the placeholder values: consecutive blocks of ntrain / m samples
# share one group id in 0..m-1.
for i in range(m):
    group_train[int(i * ntrain / m):int((i + 1) * ntrain / m)] = i
group_test = np.arange(ntrain) # grouping variable for test data. Some existing and some new groups
m_test = 2 * m
# Test data uses twice as many (half-sized) blocks: ids below m also occur in
# the training data, ids from m upwards are new.
for i in range(m_test):
    group_test[int(i * ntrain / m_test):int((i + 1) * ntrain / m_test)] = i
group = np.concatenate((group_train,group_test))
b = np.sqrt(sigma2_1) * np.random.normal(size=m_test) # simulate random effects
Zb = b[group]  # per-sample random effect, looked up by group id
# Put everything together
xi = np.sqrt(sigma2) * np.random.normal(size=n) # simulate error term
best_gamma = g min_score = score print('min score: ', min_score, 'for gamma: ', best_gamma) return (best_gamma, min_score) # ---------------------------------- RUN section --------------------------------- # Settings N = 240 iterations = 1000 sigma2 = 0.01**2 alpha_start = np.ones(N + 1) # note a0 # Test data test_X, test_T = datasets.make_friedman3(n_samples=1000) gamma_list = [] rms_error_list = [] sup_vec_list = [] for iter in range(20): print('data set: ', iter + 1) # Training data X, T = datasets.make_friedman3(n_samples=240) for i in range(len(T)): T[i] = T[i] + np.random.normal(scale=(T[i] / 3)) #X, T = datasets.make_friedman1(n_samples=240, noise = 1.0) kf = KFold(n_splits=5, shuffle=True)
def _generic_feature_frame(data):
    """Wrap a 2-D feature array in a DataFrame with columns named X1..Xk."""
    no_cols = np.size(data, 1)
    col_names = ['X' + str(i + 1) for i in range(no_cols)]
    return pd.DataFrame(data, columns=col_names)


def prep_data_sklearn(dataset_name, test_size=0.2, model_class='realkd', downsample_size=None, norm_mean=False,
                      random_seed=None, pos_class=None):
    """Load or generate a named dataset and split it into train and test parts.

    Args:
        dataset_name: name of the dataset; see the branches below for the
            supported values.
        test_size: forwarded to ``train_test_split``.
        model_class: not used by this function.
        downsample_size: if not None, the training part is sampled down to at
            most this many rows.
        norm_mean: if true, the training-target mean is subtracted from both
            target parts, and each target is returned as ``[values, mean]``.
        random_seed: seed forwarded to the generators, the split and the
            down-sampling.
        pos_class: not used by this function.

    Returns:
        tuple: ``(data, target_name, random_seed)`` with
        ``data = [x_train, y_train, x_test, y_test]``. For ``noisy_pairity_<d>``
        names the result of ``prep_noisy_pairity`` is returned unchanged.

    Raises:
        ValueError: if ``dataset_name`` matches none of the known datasets.
    """
    target_name, without = dataset_signature(dataset_name)

    if dataset_name == 'tic-tac-toe':
        bunch = ds.fetch_openml(dataset_name)
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        # Drop the last 7 characters of every column name.
        df.rename(lambda s: s[:-7], axis='columns', inplace=True)
        df.replace(0, 'b', inplace=True)
        df.replace(1, 'o', inplace=True)
        df.replace(2, 'x', inplace=True)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 'positive', 1, -1))
    elif dataset_name == 'kr - vs - kp':
        bunch = ds.fetch_openml(data_id=3)
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 'won', 1, -1))
    elif dataset_name == 'breast_cancer':
        bunch = ds.load_breast_cancer()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 1, 1, -1))
    elif dataset_name == 'iris':
        bunch = ds.load_iris()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 1, 1, -1))
    elif dataset_name == 'make_friedman1':
        data, target = ds.make_friedman1(n_samples=2000, n_features=10, noise=0.1, random_state=random_seed)  # 1
        data_rf = _generic_feature_frame(data)
        target = pd.Series(target)
    elif dataset_name == 'make_friedman2':
        data, target = ds.make_friedman2(n_samples=2000, noise=0.1, random_state=random_seed)  # 1
        data_rf = _generic_feature_frame(data)
        target = pd.Series(target)
    elif dataset_name == 'make_friedman3':
        data, target = ds.make_friedman3(n_samples=2000, noise=0.1, random_state=random_seed)
        data_rf = _generic_feature_frame(data)
        target = pd.Series(target)
    elif dataset_name == 'make_classification2':
        data, target = ds.make_classification(n_samples=2000, n_features=8, n_classes=2,
                                              hypercube=True, n_clusters_per_class=3,
                                              n_informative=3, n_redundant=3, n_repeated=0,
                                              random_state=random_seed)
        data_rf = _generic_feature_frame(data)
        target = pd.Series(where(target == 1, 1, -1))
    elif dataset_name == 'make_classification3':
        data, target = ds.make_classification(n_samples=2000, n_features=15, n_classes=3,
                                              hypercube=True, n_clusters_per_class=3,
                                              n_informative=5, n_redundant=5, n_repeated=0,
                                              random_state=random_seed)
        data_rf = _generic_feature_frame(data)
        target = pd.Series(where(target == 1, 1, -1))
    elif dataset_name == 'load_wine':
        bunch = ds.load_wine()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 1, 1, -1))
    elif dataset_name == 'make_hastie_10_2':
        data, target = ds.make_hastie_10_2(n_samples=12000, random_state=random_seed)
        data_rf = _generic_feature_frame(data)
        target = pd.Series(where(target == 1, 1, -1))
    elif dataset_name == 'load_diabetes':
        bunch = ds.load_diabetes()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(bunch.target)
    elif dataset_name[:-1] == 'noisy_pairity_':
        # The trailing character of the name is the parity dimension; this
        # dataset comes back already prepared, so return it as is.
        d = int(dataset_name[-1])
        data, target_name, random_seed = prep_noisy_pairity(d=d, random_seed=random_seed)
        return data, target_name, random_seed
    elif dataset_name == 'digits5':
        data_rf, target = prep_digits()
    else:
        # Previously an unknown name fell through to the split below and
        # failed there on an unbound local variable.
        raise ValueError("unknown dataset name: %r" % (dataset_name,))

    x_train, x_test, y_train, y_test = train_test_split(data_rf, target, test_size=test_size,
                                                        random_state=random_seed)

    if downsample_size is not None:
        # Attach the target so features and target are sampled together.
        x_train[target_name] = y_train
        sampled_train = x_train.sample(n=min(downsample_size, len(y_train)), random_state=random_seed)
        x_train.reset_index(inplace=True, drop=True)  # this may be unncessesary
        y_train = sampled_train[target_name]
        x_train = sampled_train.drop([target_name], axis='columns')

    if norm_mean:  # scikitlearn transformer.
        target_train_mean = sum(y_train) / len(y_train)
        y_train -= target_train_mean
        y_test -= target_train_mean
        y_train = [y_train, target_train_mean]
        y_test = [y_test, target_train_mean]

    data = [x_train, y_train, x_test, y_test]
    return data, target_name, random_seed
from sklearn import datasets
import matplotlib.pyplot as plt

# make_friedman3 data
# Draw 100 noise-free samples; random_state=None means no fixed seed is
# supplied, so the printed values are not pinned to a particular draw.
X, y = datasets.make_friedman3(n_samples=100, noise=0.0, random_state=None)
print(X)
print(y)
# Call each sample generator with its default arguments, in this order, and
# print what it returns.
for _generator_name in (
    "make_blobs",
    "make_circles",
    "make_classification",
    "make_friedman1",
    "make_friedman2",
    "make_friedman3",
    "make_gaussian_quantiles",
    "make_hastie_10_2",
    "make_moons",
    "make_multilabel_classification",
):
    print("the output of " + _generator_name + "() :: ",
          getattr(datasets, _generator_name)())
def _resolve_args(overrides, defaults, names):
    """Map each name to the caller's override, else the attribute on ``defaults``."""
    return {name: overrides.get(name, getattr(defaults, name)) for name in names}


def getSKData(style='timeseries', as_dataframe=False, n_samples=10, **kwargs):
    """Generate a synthetic scikit-learn dataset of the requested style.

    Every generator option can be overridden through ``kwargs``; options that
    are not given fall back to the attribute of the same name on the matching
    ``*Args`` class (defined outside this block). Options are passed to the
    generators by keyword.

    Args:
        style: one of ``'regression'``, ``'blobs'``, ``'classification'``,
            ``'multilabel'``, ``'gaussian'``, ``'hastie'``, ``'circles'``,
            ``'moons'``, ``'biclusters'``, ``'scurve'``, ``'checker'``,
            ``'friedman'``, ``'friedman2'`` or ``'friedman3'``.
        as_dataframe: only used for ``'biclusters'``; if true, the first
            result and the transposed second result are concatenated
            column-wise into one DataFrame.
        n_samples: number of samples, for the generators that take one.
        **kwargs: generator options overriding the defaults.

    Returns:
        The generator's return value, or None when ``style`` matches none of
        the names above (which includes the default ``'timeseries'``).
    """
    if style == 'regression':
        return make_regression(n_samples=n_samples, **_resolve_args(kwargs, RegressionArgs, (
            'n_features', 'n_informative', 'n_targets', 'bias', 'effective_rank',
            'tail_strength', 'noise', 'shuffle', 'coef', 'random_state')))
    elif style == 'blobs':
        return make_blobs(n_samples=n_samples, **_resolve_args(kwargs, BlobsArgs, (
            'n_features', 'centers', 'cluster_std', 'center_box', 'shuffle',
            'random_state')))
    elif style == 'classification':
        return make_classification(n_samples=n_samples, **_resolve_args(kwargs, ClassificationArgs, (
            'n_features', 'n_informative', 'n_redundant', 'n_repeated', 'n_classes',
            'n_clusters_per_class', 'weights', 'flip_y', 'class_sep', 'hypercube',
            'shift', 'scale', 'shuffle', 'random_state')))
    elif style == 'multilabel':
        return make_multilabel_classification(
            n_samples=n_samples, **_resolve_args(kwargs, MultilabelClassificationArgs, (
                'n_features', 'n_classes', 'n_labels', 'length', 'allow_unlabeled',
                'sparse', 'return_indicator', 'return_distributions', 'random_state')))
    elif style == 'gaussian':
        return make_gaussian_quantiles(n_samples=n_samples, **_resolve_args(kwargs, GaussianArgs, (
            'n_features', 'mean', 'cov', 'n_classes', 'shuffle', 'random_state')))
    elif style == 'hastie':
        return make_hastie_10_2(n_samples, **_resolve_args(kwargs, HastieArgs, ('random_state',)))
    elif style == 'circles':
        return make_circles(n_samples=n_samples, **_resolve_args(kwargs, CirclesArgs, (
            'shuffle', 'noise', 'random_state', 'factor')))
    elif style == 'moons':
        return make_moons(n_samples=n_samples, **_resolve_args(kwargs, MoonsArgs, (
            'shuffle', 'noise', 'random_state')))
    elif style == 'biclusters':
        x = make_biclusters(**_resolve_args(kwargs, BiclusterArgs, (
            'shape', 'n_clusters', 'noise', 'minval', 'maxval', 'shuffle',
            'random_state')))
        if as_dataframe:
            return pd.concat([pd.DataFrame(x[0]), pd.DataFrame(x[1].T)], axis=1)
        else:
            return x
    elif style == 'scurve':
        return make_s_curve(n_samples=n_samples, **_resolve_args(kwargs, SCurveArgs, (
            'noise', 'random_state')))
    elif style == 'checker':
        return make_checkerboard(**_resolve_args(kwargs, CheckerArgs, (
            'shape', 'n_clusters', 'noise', 'minval', 'maxval', 'shuffle',
            'random_state')))
    elif style == 'friedman':
        return make_friedman1(n_samples=n_samples, **_resolve_args(kwargs, FriedmanArgs, (
            'n_features', 'noise', 'random_state')))
    elif style == 'friedman2':
        return make_friedman2(n_samples=n_samples, **_resolve_args(kwargs, Friedman2Args, (
            'noise', 'random_state')))
    elif style == 'friedman3':
        return make_friedman3(n_samples=n_samples, **_resolve_args(kwargs, Friedman3Args, (
            'noise', 'random_state')))
    # Unrecognised styles have always produced None; keep that contract.
    return None
def _resolve_args(overrides, defaults, names):
    """Map each name to the caller's override, else the attribute on ``defaults``."""
    return {name: overrides.get(name, getattr(defaults, name)) for name in names}


def getSKData(style='timeseries', n_samples=1, **kwargs):
    """Generate a synthetic scikit-learn dataset of the requested style.

    Every generator option can be overridden through ``kwargs``; options that
    are not given fall back to the attribute of the same name on the matching
    ``*Args`` class (defined outside this block). Options are passed to the
    generators by keyword.

    Args:
        style: a ``Style`` member, or a string that is lower-cased and
            converted with ``Style(...)``.
        n_samples: number of samples, for the generators that take one.
        **kwargs: generator options overriding the defaults.

    Returns:
        The generator's return value, or None when ``style`` is a member that
        none of the branches below handles.
    """
    if isinstance(style, str):
        style = Style(style.lower())

    if style == Style.REGRESSION:
        return make_regression(n_samples=n_samples, **_resolve_args(kwargs, RegressionArgs, (
            'n_features', 'n_informative', 'n_targets', 'bias', 'effective_rank',
            'tail_strength', 'noise', 'shuffle', 'coef', 'random_state')))
    elif style == Style.BLOBS:
        return make_blobs(n_samples=n_samples, **_resolve_args(kwargs, BlobsArgs, (
            'n_features', 'centers', 'cluster_std', 'center_box', 'shuffle',
            'random_state')))
    elif style == Style.CLASSIFICATION:
        return make_classification(n_samples=n_samples, **_resolve_args(kwargs, ClassificationArgs, (
            'n_features', 'n_informative', 'n_redundant', 'n_repeated', 'n_classes',
            'n_clusters_per_class', 'weights', 'flip_y', 'class_sep', 'hypercube',
            'shift', 'scale', 'shuffle', 'random_state')))
    elif style == Style.MULTILABEL:
        return make_multilabel_classification(
            n_samples=n_samples, **_resolve_args(kwargs, MultilabelClassificationArgs, (
                'n_features', 'n_classes', 'n_labels', 'length', 'allow_unlabeled',
                'sparse', 'return_indicator', 'return_distributions', 'random_state')))
    elif style == Style.GAUSSIAN:
        return make_gaussian_quantiles(n_samples=n_samples, **_resolve_args(kwargs, GaussianArgs, (
            'n_features', 'mean', 'cov', 'n_classes', 'shuffle', 'random_state')))
    elif style == Style.HASTIE:
        return make_hastie_10_2(n_samples, **_resolve_args(kwargs, HastieArgs, ('random_state',)))
    elif style == Style.CIRCLES:
        return make_circles(n_samples=n_samples, **_resolve_args(kwargs, CirclesArgs, (
            'shuffle', 'noise', 'random_state', 'factor')))
    elif style == Style.MOONS:
        return make_moons(n_samples=n_samples, **_resolve_args(kwargs, MoonsArgs, (
            'shuffle', 'noise', 'random_state')))
    elif style == Style.BICLUSTERS:
        return make_biclusters(**_resolve_args(kwargs, BiclusterArgs, (
            'shape', 'n_clusters', 'noise', 'minval', 'maxval', 'shuffle',
            'random_state')))
    elif style == Style.SCURVE:
        return make_s_curve(n_samples=n_samples, **_resolve_args(kwargs, SCurveArgs, (
            'noise', 'random_state')))
    elif style == Style.CHECKER:
        return make_checkerboard(**_resolve_args(kwargs, CheckerArgs, (
            'shape', 'n_clusters', 'noise', 'minval', 'maxval', 'shuffle',
            'random_state')))
    elif style == Style.FRIEDMAN:
        return make_friedman1(n_samples=n_samples, **_resolve_args(kwargs, FriedmanArgs, (
            'n_features', 'noise', 'random_state')))
    elif style == Style.FRIEDMAN2:
        return make_friedman2(n_samples=n_samples, **_resolve_args(kwargs, Friedman2Args, (
            'noise', 'random_state')))
    elif style == Style.FRIEDMAN3:
        return make_friedman3(n_samples=n_samples, **_resolve_args(kwargs, Friedman3Args, (
            'noise', 'random_state')))
    # Styles without a branch above have always produced None; keep that.
    return None
Created on Wed Apr 17 18:50:49 2019 author: Fabio Sigrist """ import sklearn.datasets as datasets import numpy as np import KTBoost.KTBoost as KTBoost import random """ Example 1 """ # simulate data random.seed(10) n = 1000 X, lp = datasets.make_friedman3(n_samples=n) X_test, y_test = datasets.make_friedman3(n_samples=n) lp = lp * 5 + 0.2 y_test = y_test * 5 + 0.2 y = np.random.normal(loc=lp, scale=1) # apply censoring yu = 8 yl = 5 y[y >= yu] = yu y[y <= yl] = yl # train model and make predictions model = KTBoost.BoostingRegressor(loss='tobit', yl=yl, yu=yu).fit(X, y) y_pred = model.predict(X_test) # mean square error (approx. 0.44 for n=1000) print("Test error Grabit: " + str(((y_pred - y_test)**2).mean()))