Example #1
def FriedmanDataset_3():
    # Both calls share random_state=0, so the first 240 test samples coincide
    # with the (pre-noise) training samples.
    d_train = datasets.make_friedman3(240, random_state=0)
    d_test = datasets.make_friedman3(1000, random_state=0)

    # Perturb each training feature with additive Gaussian noise whose
    # standard deviation is one third of the feature value.
    features_train = d_train[0]
    for i in range(240):
        features_train[i] += np.random.normal(0, features_train[i] / 3)

    target_train = d_train[1]
    features_test = d_test[0]
    target_test = d_test[1]
    return features_train, target_train, features_test, target_test
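A minimal usage sketch for FriedmanDataset_3 (the regressor choice and the imports here are illustrative, not from the source):

from sklearn import datasets
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Fit on the noisy training split, score on the clean test split.
X_train, y_train, X_test, y_test = FriedmanDataset_3()
reg = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)
print("test MSE:", mean_squared_error(y_test, reg.predict(X_test)))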
def test_make_friedman3():
    X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)

    assert_equal(X.shape, (5, 4), "X shape mismatch")
    assert_equal(y.shape, (5,), "y shape mismatch")

    assert_array_almost_equal(y, np.arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]))
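For reference, make_friedman3 draws four independent uniform features (0 <= x0 <= 100, 40*pi <= x1 <= 560*pi, 0 <= x2 <= 1, 1 <= x3 <= 11) and computes the target as y = arctan((x1*x2 - 1/(x1*x3)) / x0) + noise * N(0, 1), which is exactly the expression the assertion above recomputes with noise=0.0.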
def test_regression_synthetic():
    """Test on synthetic regression datasets used in Leo Breiman,
    `Bagging Predictors?. Machine Learning 24(2): 123-140 (1996). """
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 2, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor()
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse
Example #5
def friedman3(n_samples=100,
              noise=0.0,
              random_state=None):

    return datasets.make_friedman3(n_samples=n_samples,
                                   noise=noise,
                                   random_state=random_state)
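A short usage example of this wrapper (the argument values are illustrative):

# friedman3 forwards straight to sklearn, so it returns the usual (X, y) pair
X, y = friedman3(n_samples=500, noise=0.1, random_state=42)
print(X.shape, y.shape)  # (500, 4) (500,)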
Example #6
def test_make_friedman3():
    X, y = make_friedman3(n_samples=5, noise=0.0, random_state=0)

    assert X.shape == (5, 4), "X shape mismatch"
    assert y.shape == (5, ), "y shape mismatch"

    assert_array_almost_equal(
        y, np.arctan((X[:, 1] * X[:, 2] - 1 / (X[:, 1] * X[:, 3])) / X[:, 0]))
Example #7
def friedman_3(n, noise):
    """Generate the Friedman #3 data set.

    Args:
        n (int): number of data samples
        noise (float): standard deviation of the Gaussian noise added to the target

    Returns:
        Friedman #3 data set as an (X, y) tuple

    """
    return make_friedman3(n_samples=n, noise=noise)
Example #8
def define_tested_reg_datasets():

    gDatasets = {}
    gDatasets["diabetes"] = datasets.load_diabetes()
    gDatasets["boston"] = datasets.load_boston()  # removed in scikit-learn 1.2
    gDatasets["friedman1"] = datasets.make_friedman1(random_state=1960)
    gDatasets["friedman2"] = datasets.make_friedman2(random_state=1960)
    gDatasets["friedman3"] = datasets.make_friedman3(random_state=1960)
    gDatasets["RandomReg_10"] = datasets.make_regression(n_features=10, random_state=1960)
    gDatasets["RandomReg_100"] = datasets.make_regression(n_features=100, random_state=1960)
    gDatasets["RandomReg_500"] = datasets.make_regression(n_features=500, random_state=1960)

    return gDatasets
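One plausible way to consume the returned dictionary (a sketch; Ridge and 3-fold cross-validation are stand-ins, not from the source). Bunch objects expose .data/.target, while the make_* generators return plain (X, y) tuples:

from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

def evaluate_reg_datasets(gDatasets):
    for name, ds in gDatasets.items():
        # Bunch objects have .data/.target; the generators return (X, y)
        X, y = (ds.data, ds.target) if hasattr(ds, 'data') else ds
        print(name, cross_val_score(Ridge(), X, y, cv=3).mean())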
Example #9
def generateDatas(N, F, choice):

    if choice == 'f1':
        X, Y = datasets.make_friedman1(n_samples=N, n_features=F, noise=1)
    elif choice == 'f2':
        # friedman2 and friedman3 always have 4 features, so F does not apply
        X, Y = datasets.make_friedman2(n_samples=N)
    elif choice == 'f3':
        X, Y = datasets.make_friedman3(n_samples=N)
    elif choice == 'boston':
        boston = datasets.load_boston()
        X, Y = boston.data, boston.target
    else:
        raise ValueError('unknown choice: %s' % choice)

    return X, Y
Example #10
    def genFriedman(self, i=1, N=240, D=10):
        if i not in range(1, 4):
            raise Exception('not a correct dataset')

        if i == 1:
            X, Y = datasets.make_friedman1(n_samples=N, n_features=D)

        if i == 2:
            # friedman2 and friedman3 always have 4 features; passing D
            # positionally would set the noise parameter instead
            X, Y = datasets.make_friedman2(n_samples=N)

        if i == 3:
            X, Y = datasets.make_friedman3(n_samples=N)
        return X, Y
def make_data(n_samples=1000, n_features=1, n_targets=1, informative_prop=1.0,
              noise=0.0, test_prop=0.1, valid_prop=0.3, method='linear'):
    if method == 'linear':
        params = dict(n_features=n_features,
                      n_informative=int(n_features*informative_prop),
                      noise=noise,
                      n_targets=n_targets,
                      n_samples=n_samples,
                      shuffle=False,
                      bias=0.0)
        X, Y = make_regression(**params)
    elif method == 'boston':
        boston = load_boston()
        X = boston.data
        Y = boston.target
    else:
        # make_friedman3 has no n_features parameter: it always returns 4 features
        X, Y = make_friedman3(n_samples=n_samples, noise=noise)

    X = MinMaxScaler(feature_range=(0.0, 1.0)).fit_transform(X)
    X = X.astype(theano.config.floatX)
    if len(Y.shape) == 1:
        Y = Y.reshape(-1, 1)  # MinMaxScaler expects a 2-D array
    Y = MinMaxScaler(feature_range=(0.0, 1.0)).fit_transform(Y)
    Y = Y.astype(theano.config.floatX)
    if len(X.shape) > 1:
        n_features = X.shape[1]
    else:
        X = X.reshape(X.shape[0], -1)
        n_features = 1
    if len(Y.shape) > 1:
        n_targets = Y.shape[1]
    else:
        Y = Y.reshape(Y.shape[0], -1)
        n_targets = 1

    X_train, Y_train, X_valid, Y_valid, X_test, Y_test = \
        train_valid_test_split(X, Y,
                               test_prop=test_prop, valid_prop=valid_prop)
    return dict(
        X_train=theano.shared(X_train),
        Y_train=theano.shared(Y_train),
        X_valid=theano.shared(X_valid),
        Y_valid=theano.shared(Y_valid),
        X_test=theano.shared(X_test),
        Y_test=theano.shared(Y_test),
        num_examples_train=X_train.shape[0],
        num_examples_valid=X_valid.shape[0],
        num_examples_test=X_test.shape[0],
        input_dim=n_features,
        output_dim=n_targets)
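train_valid_test_split is not shown in this snippet; a minimal NumPy sketch with the assumed signature (blocks carved off the end of the arrays, matching the return order above) could look like this:

import numpy as np

def train_valid_test_split(X, Y, test_prop=0.1, valid_prop=0.3):
    # Assumed behavior: the last test_prop of rows become the test set,
    # the valid_prop block before them the validation set, the rest training.
    n = X.shape[0]
    n_test = int(n * test_prop)
    n_valid = int(n * valid_prop)
    n_train = n - n_valid - n_test
    return (X[:n_train], Y[:n_train],
            X[n_train:n_train + n_valid], Y[n_train:n_train + n_valid],
            X[n_train + n_valid:], Y[n_train + n_valid:])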
def main():
    out_dir = sys.argv[1]
    output_csv = out_dir + '/friedman3/friedman3_prep.csv'
    names = ["x1", "x2", "x3", "x4", "y"]

    (X, y) = data.make_friedman3(n_samples=10000,
                                 random_state=123456,
                                 noise=0.01)
    y = y.reshape(-1, 1)  # make y a column so it can be appended to X
    df = pd.DataFrame(np.append(X, y, axis=1), columns=names)
    df = scale(df)[1]

    # TODO Transform box-cox.
    df.to_csv(output_csv, index=False)
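The call scale(df)[1] implies a helper that returns the fitted scaler together with the scaled frame; a hypothetical version consistent with that call:

import pandas as pd
from sklearn.preprocessing import StandardScaler

def scale(df):
    # Hypothetical helper matching scale(df)[1] above: returns the fitted
    # scaler and the standardized DataFrame.
    scaler = StandardScaler().fit(df)
    return scaler, pd.DataFrame(scaler.transform(df), columns=df.columns)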
Example #14
def generate_Dataset(name_dataset):
    # default
    ind = f(name_dataset)
    if ind == 1:
        x, y = datasets.make_friedman1(n_samples=100, n_features=10, noise=0.0, random_state=None)
    elif ind == 2:
        x, y = datasets.make_friedman2(n_samples=100, noise=0.0, random_state=None)
    elif ind == 3:
        x, y = datasets.make_friedman3(n_samples=100, noise=0.0, random_state=None)
    else:
        x, y = datasets.load_boston(return_X_y=True)

    x = x.tolist()
    return x, y
Example #15
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = {
        'n_estimators': 100,
        'max_depth': 4,
        'min_samples_split': 2,
        'learning_rate': 0.1,
        'loss': 'ls'
    }

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state,
                                   noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        clf = GradientBoostingRegressor(presort=presort)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 5.0)

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 1700.0)

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 0.015)
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # "Bagging Predictors". Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = {
        "n_estimators": 100,
        "max_depth": 4,
        "min_samples_split": 2,
        "learning_rate": 0.1,
        "loss": "squared_error",
    }

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200, random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor()
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 5.0

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 1700.0

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.015

def uniform_dataset(args):
    X = np.random.random(size=(args.num_examples, args.num_features))
    y = np.random.choice([-1, 1], size=args.num_examples)
    return (X, y)

DATASETS = {
    "uniform": uniform_dataset,
    "hastie": lambda args: datasets.make_hastie_10_2(
        n_samples=args.num_examples),
    "friedman1": lambda args: datasets.make_friedman1(
        n_samples=args.num_examples, n_features=args.num_features),
    "friedman2": lambda args: datasets.make_friedman2(
        n_samples=args.num_examples, noise=args.noise),
    "friedman3": lambda args: datasets.make_friedman3(
        n_samples=args.num_examples, noise=args.noise),
    "make_regression": lambda args: datasets.make_regression(
        n_samples=args.num_examples,
        n_features=args.num_features,
        n_informative=args.num_informative)
}

ENSEMBLE_REGRESSORS = [
    ("GB-D1", with_depth(ensemble.GradientBoostingRegressor, 1)),
    ("GB-D3", with_depth(ensemble.GradientBoostingRegressor, 3)),
    ("GB-B10", with_best_first(ensemble.GradientBoostingRegressor, 10)),
    ("RF-D1", with_depth(ensemble.RandomForestRegressor, 1)),
    ("RF-D3", with_depth(ensemble.RandomForestRegressor, 3)),
    ("RF-D5", with_depth(ensemble.RandomForestRegressor, 5)),
]
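Every entry in DATASETS takes an argparse-style args object; a sketch of how the table might be driven (the attribute names follow the lambdas above, the defaults are illustrative):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--dataset", default="friedman3", choices=sorted(DATASETS))
parser.add_argument("--num_examples", type=int, default=1000)
parser.add_argument("--num_features", type=int, default=10)
parser.add_argument("--num_informative", type=int, default=5)
parser.add_argument("--noise", type=float, default=0.0)
args = parser.parse_args()

X, y = DATASETS[args.dataset](args)
print(X.shape, y.shape)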
Example #19
result.fit(X)


# In[15]:

result.labels_


# In[17]:

from sklearn.datasets import make_friedman3


# In[26]:

X, y = make_friedman3(n_samples=100, noise=0.0, random_state=0)
#print(X)
#print(y)
results = KMeans(n_clusters=5, init='random')
results.fit(X)


# In[27]:

results.labels_


Example #20
def friedman3(n_samples=20000):
    """ Generated data """
    (data, target) = datasets.make_friedman3(n_samples=n_samples)
    return DatasetFactory.Dataset(data=data, target=target)
Example #21
    return (X, y)


DATASETS = {
    "uniform":
    uniform_dataset,
    "hastie":
    lambda args: datasets.make_hastie_10_2(n_samples=args.num_examples),
    "friedman1":
    lambda args: datasets.make_friedman1(n_samples=args.num_examples,
                                         n_features=args.num_features),
    "friedman2":
    lambda args: datasets.make_friedman2(n_samples=args.num_examples,
                                         noise=args.noise),
    "friedman3":
    lambda args: datasets.make_friedman3(n_samples=args.num_examples,
                                         noise=args.noise),
    "make_regression":
    lambda args: datasets.make_regression(n_samples=args.num_examples,
                                          n_features=args.num_features,
                                          n_informative=args.num_informative)
}

ENSEMBLE_REGRESSORS = [
    ("GB-D1", with_depth(ensemble.GradientBoostingRegressor, 1)),
    ("GB-D3", with_depth(ensemble.GradientBoostingRegressor, 3)),
    ("GB-B10", with_best_first(ensemble.GradientBoostingRegressor, 10)),
    ("RF-D1", with_depth(ensemble.RandomForestRegressor, 1)),
    ("RF-D3", with_depth(ensemble.RandomForestRegressor, 3)),
    ("RF-D5", with_depth(ensemble.RandomForestRegressor, 5)),
]
Example #22
dataset_x1 = x
dataset_y1 = y
ax = fig.add_subplot(131, projection='3d')
my_scatter_plot = ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y, cmap=color_map)
fig.colorbar(my_scatter_plot)
plt.title('make_friedman1')

x, y = dt.make_friedman2(n_samples=1000, random_state=rand_state)
dataset_x2 = x
dataset_y2 = y
ax = fig.add_subplot(132, projection='3d')
my_scatter_plot = ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y, cmap=color_map)
fig.colorbar(my_scatter_plot)
plt.title('make_friedman2')

x, y = dt.make_friedman3(n_samples=1000, random_state=rand_state)
dataset_x3 = x
dataset_y3 = y
ax = fig.add_subplot(133, projection='3d')
my_scatter_plot = ax.scatter(x[:, 0], x[:, 1], x[:, 2], c=y, cmap=color_map)
fig.colorbar(my_scatter_plot)
plt.suptitle('make_friedman?() for Non-Linear Data', fontsize=20)
plt.title('make_friedman3')
# df_x1 = pd.DataFrame(dataset_x1, columns=['x0','x1','x2','x3','x4'])
# print(df_x1)
# df_y1 = pd.DataFrame(dataset_y1, columns=['y1'])
# print(df_y1)
# df_x2 = pd.DataFrame(dataset_x2, columns=['x0','x1','x2','x3'])
# print(df_x2)
# df_y2 = pd.DataFrame(dataset_y2,columns=['y2'])
# print(df_y2)
import gpboost as gpb
import numpy as np
import sklearn.datasets as datasets
import time
import pandas as pd
print("It is recommended that the examples are run in interactive mode")

# --------------------Simulate data----------------
ntrain = 5000 # number of samples for training
n = 2 * ntrain # combined number of training and test data
m = 500  # number of categories / levels for grouping variable
sigma2_1 = 1  # random effect variance
sigma2 = 1 ** 2  # error variance
# Simulate non-linear mean function
np.random.seed(1)
X, F = datasets.make_friedman3(n_samples=n)
X = pd.DataFrame(X, columns=['variable_1', 'variable_2', 'variable_3', 'variable_4'])
F = F * 10 ** 0.5  # with this choice, the fixed-effects regression function has the same variance as the random effects
# Simulate random effects
group_train = np.arange(ntrain)  # grouping variable
for i in range(m):
    group_train[int(i * ntrain / m):int((i + 1) * ntrain / m)] = i
group_test = np.arange(ntrain) # grouping variable for test data. Some existing and some new groups
m_test = 2 * m
for i in range(m_test):
    group_test[int(i * ntrain / m_test):int((i + 1) * ntrain / m_test)] = i
group = np.concatenate((group_train, group_test))
b = np.sqrt(sigma2_1) * np.random.normal(size=m_test)  # simulate random effects
Zb = b[group]
# Put everything together
xi = np.sqrt(sigma2) * np.random.normal(size=n)  # simulate error term
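The snippet stops after simulating the error term; following its own comments, the response would presumably be assembled and split like this (an assumed continuation, not shown in the excerpt):

# Put everything together: fixed effects + random effects + error,
# then split back into the training and test halves.
y = F + Zb + xi
y_train, y_test = y[0:ntrain], y[ntrain:n]
X_train, X_test = X.iloc[0:ntrain], X.iloc[ntrain:n]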
Example #24
            best_gamma = g
            min_score = score
    print('min score: ', min_score, 'for gamma: ', best_gamma)
    return (best_gamma, min_score)


# ---------------------------------- RUN section ---------------------------------

# Settings
N = 240
iterations = 1000
sigma2 = 0.01**2
alpha_start = np.ones(N + 1)  # note a0

# Test data
test_X, test_T = datasets.make_friedman3(n_samples=1000)

gamma_list = []
rms_error_list = []
sup_vec_list = []
for it in range(20):
    print('data set: ', it + 1)

    # Training data
    X, T = datasets.make_friedman3(n_samples=240)
    for i in range(len(T)):
        T[i] = T[i] + np.random.normal(scale=(T[i] / 3))
    #X, T = datasets.make_friedman1(n_samples=240, noise = 1.0)

    kf = KFold(n_splits=5, shuffle=True)
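The body of the cross-validation loop is cut off here; below is a sketch of a 5-fold gamma search consistent with the (best_gamma, min_score) return at the top of this example, using an RBF-kernel SVR as a stand-in for whichever kernel machine the source actually trains:

from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR

def select_gamma(X, T, gammas, kf):
    # Pick the gamma with the lowest mean cross-validation error.
    best_gamma, min_score = None, float('inf')
    for g in gammas:
        fold_errors = []
        for train_idx, val_idx in kf.split(X):
            model = SVR(kernel='rbf', gamma=g).fit(X[train_idx], T[train_idx])
            fold_errors.append(mean_squared_error(T[val_idx], model.predict(X[val_idx])))
        score = sum(fold_errors) / len(fold_errors)
        if score < min_score:
            best_gamma, min_score = g, score
    print('min score: ', min_score, 'for gamma: ', best_gamma)
    return (best_gamma, min_score)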
def prep_data_sklearn(dataset_name, test_size=0.2, model_class='realkd', downsample_size=None, norm_mean=False,
                      random_seed=None, pos_class=None):

    target_name, without = dataset_signature(dataset_name)

    if dataset_name == 'tic-tac-toe':
        bunch = ds.fetch_openml(dataset_name)
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        df.rename(lambda s: s[:-7], axis='columns', inplace=True)
        df.replace(0, 'b', inplace=True)
        df.replace(1, 'o', inplace=True)
        df.replace(2, 'x', inplace=True)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 'positive', 1, -1))
    elif dataset_name == 'kr-vs-kp':
        bunch = ds.fetch_openml(data_id=3)
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 'won', 1, -1))
    elif dataset_name == 'breast_cancer':
        bunch = ds.load_breast_cancer()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 1, 1, -1))
    elif dataset_name == 'iris':
        bunch = ds.load_iris()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 1, 1, -1))
    elif dataset_name == 'make_friedman1':
        global_friedman_cols = 10
        data, target = ds.make_friedman1(n_samples=2000, n_features=10, noise=0.1, random_state=random_seed) # 1
        no_cols = np.size(data, 1)
        col_names = ['X' + str(i+1) for i in range(no_cols)]
        data_rf = pd.DataFrame(data, columns=col_names)
        target = pd.Series(target)
    elif dataset_name == 'make_friedman2':
        data, target = ds.make_friedman2(n_samples=2000, noise=0.1, random_state=random_seed) # 1
        no_cols = np.size(data, 1)
        col_names = ['X' + str(i+1) for i in range(no_cols)]
        data_rf = pd.DataFrame(data, columns=col_names)
        target = pd.Series(target)
    elif dataset_name == 'make_friedman3':
        data, target = ds.make_friedman3(n_samples=2000, noise=0.1, random_state=random_seed)
        no_cols = np.size(data, 1)
        col_names = ['X' + str(i+1) for i in range(no_cols)]
        data_rf = pd.DataFrame(data, columns=col_names)
        target = pd.Series(target)
    elif dataset_name == 'make_classification2':
        data, target = ds.make_classification(n_samples=2000, n_features=8, n_classes=2,
                                              hypercube=True, n_clusters_per_class=3,
                                              n_informative=3, n_redundant=3, n_repeated=0,
                                              random_state=random_seed)
        no_cols = np.size(data, 1)
        col_names = ['X' + str(i+1) for i in range(no_cols)]
        data_rf = pd.DataFrame(data, columns=col_names)
        target = pd.Series(where(target == 1, 1, -1))
    elif dataset_name == 'make_classification3':
        data, target = ds.make_classification(n_samples=2000, n_features=15, n_classes=3,
                                              hypercube=True, n_clusters_per_class=3,
                                              n_informative=5, n_redundant=5, n_repeated=0,
                                              random_state=random_seed)
        no_cols = np.size(data, 1)
        col_names = ['X' + str(i + 1) for i in range(no_cols)]
        data_rf = pd.DataFrame(data, columns=col_names)
        target = pd.Series(where(target == 1, 1, -1))
    elif dataset_name == 'load_wine':
        bunch = ds.load_wine()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(where(bunch.target == 1, 1, -1))
    elif dataset_name == 'make_hastie_10_2':
        data, target = ds.make_hastie_10_2(n_samples=12000, random_state=random_seed)
        no_cols = np.size(data, 1)
        col_names = ['X' + str(i+1) for i in range(no_cols)]
        data_rf = pd.DataFrame(data, columns=col_names)
        target = pd.Series(where(target == 1, 1, -1))
    elif dataset_name == 'load_diabetes':
        bunch = ds.load_diabetes()
        df = pd.DataFrame(bunch.data, columns=bunch.feature_names)
        data_rf = pd.get_dummies(df)
        target = pd.Series(bunch.target)
    elif dataset_name[:-1] == 'noisy_pairity_':
        d = int(dataset_name[-1])
        data, target_name, random_seed = prep_noisy_pairity(d=d, random_seed=random_seed)
        return data, target_name, random_seed
    elif dataset_name == 'digits5':
        data_rf, target = prep_digits()

    x_train, x_test, y_train, y_test = train_test_split(data_rf, target, test_size=test_size, random_state=random_seed)

    if downsample_size is not None:
        x_train[target_name] = y_train
        sampled_train = x_train.sample(n=min(downsample_size, len(y_train)), random_state=random_seed)
        x_train.reset_index(inplace=True, drop=True)  # this may be unnecessary
        y_train = sampled_train[target_name]
        x_train = sampled_train.drop([target_name], axis='columns')

    if norm_mean:  # center targets on the training mean
        target_train_mean = sum(y_train) / len(y_train)
        y_train -= target_train_mean
        y_test -= target_train_mean

        y_train = [y_train, target_train_mean]
        y_test = [y_test, target_train_mean]


    data = [x_train, y_train, x_test, y_test]

    n = (len(y_train), len(y_test))

    return data, target_name, random_seed
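A usage sketch (not standalone: dataset_signature, the prep_* helpers and the imports are defined elsewhere in the source module):

data, target_name, seed = prep_data_sklearn('make_friedman3', test_size=0.2, random_seed=1)
x_train, y_train, x_test, y_test = data
print(target_name, x_train.shape, x_test.shape)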
from sklearn import datasets
import matplotlib.pyplot as plt

# make_friedman3 data
X, y = datasets.make_friedman3(n_samples=100, noise=0.0, random_state=None)
print(X)
print(y)
print("the output of make_blobs() :: ", datasets.make_blobs())

#make_circles() executed
print("the output of make_circles() :: ", datasets.make_circles())

#make_classification() executed
print("the output of make_classification() :: ",
      datasets.make_classification())
#make_friedman1() executed
print("the output of make_friedman1() :: ", datasets.make_friedman1())

#make_friedman2() executed
print("the output of make_friedman2() :: ", datasets.make_friedman2())

#make_friedman3() executed
print("the output of make_friedman3() :: ", datasets.make_friedman3())

#make_gaussian_quantiles() executed
print("the output of make_gaussian_quantiles() :: ",
      datasets.make_gaussian_quantiles())

#make_hastie_10_2() executed
print("the output of make_hastie_10_2() :: ", datasets.make_hastie_10_2())

#make_moons() executed
print("the output of make_moons() :: ", datasets.make_moons())

#make_multilabel_classification() executed
print("the output of make_multilabel_classification() :: ",
      datasets.make_multilabel_classification())
Example #28
def getSKData(style='timeseries', as_dataframe=False, n_samples=10, **kwargs):
    if style == 'regression':
        return make_regression(n_samples,
                               kwargs.get('n_features', RegressionArgs.n_features),
                               kwargs.get('n_informative', RegressionArgs.n_informative),
                               kwargs.get('n_targets', RegressionArgs.n_targets),
                               kwargs.get('bias', RegressionArgs.bias),
                               kwargs.get('effective_rank', RegressionArgs.effective_rank),
                               kwargs.get('tail_strength', RegressionArgs.tail_strength),
                               kwargs.get('noise', RegressionArgs.noise),
                               kwargs.get('shuffle', RegressionArgs.shuffle),
                               kwargs.get('coef', RegressionArgs.coef),
                               kwargs.get('random_state', RegressionArgs.random_state))
    elif style == 'blobs':
        return make_blobs(n_samples,
                          kwargs.get('n_features', BlobsArgs.n_features),
                          kwargs.get('centers', BlobsArgs.centers),
                          kwargs.get('cluster_std', BlobsArgs.cluster_std),
                          kwargs.get('center_box', BlobsArgs.center_box),
                          kwargs.get('shuffle', BlobsArgs.shuffle),
                          kwargs.get('random_state', BlobsArgs.random_state))
    elif style == 'classification':
        return make_classification(n_samples,
                                   kwargs.get('n_features', ClassificationArgs.n_features),
                                   kwargs.get('n_informative', ClassificationArgs.n_informative),
                                   kwargs.get('n_redundant', ClassificationArgs.n_redundant),
                                   kwargs.get('n_repeated', ClassificationArgs.n_repeated),
                                   kwargs.get('n_classes', ClassificationArgs.n_classes),
                                   kwargs.get('n_clusters_per_class', ClassificationArgs.n_clusters_per_class),
                                   kwargs.get('weights', ClassificationArgs.weights),
                                   kwargs.get('flip_y', ClassificationArgs.flip_y),
                                   kwargs.get('class_sep', ClassificationArgs.class_sep),
                                   kwargs.get('hypercube', ClassificationArgs.hypercube),
                                   kwargs.get('shift', ClassificationArgs.shift),
                                   kwargs.get('scale', ClassificationArgs.scale),
                                   kwargs.get('shuffle', ClassificationArgs.shuffle),
                                   kwargs.get('random_state', ClassificationArgs.random_state))
    elif style == 'multilabel':
        return make_multilabel_classification(n_samples,
                                              kwargs.get('n_features', MultilabelClassificationArgs.n_features),
                                              kwargs.get('n_classes', MultilabelClassificationArgs.n_classes),
                                              kwargs.get('n_labels', MultilabelClassificationArgs.n_labels),
                                              kwargs.get('length', MultilabelClassificationArgs.length),
                                              kwargs.get('allow_unlabeled', MultilabelClassificationArgs.allow_unlabeled),
                                              kwargs.get('sparse', MultilabelClassificationArgs.sparse),
                                              kwargs.get('return_indicator', MultilabelClassificationArgs.return_indicator),
                                              kwargs.get('return_distributions', MultilabelClassificationArgs.return_distributions),
                                              kwargs.get('random_state', MultilabelClassificationArgs.random_state))
    elif style == 'gaussian':
        return make_gaussian_quantiles(n_samples=n_samples,
                                       n_features=kwargs.get('n_features', GaussianArgs.n_features),
                                       mean=kwargs.get('mean', GaussianArgs.mean),
                                       cov=kwargs.get('cov', GaussianArgs.cov),
                                       n_classes=kwargs.get('n_classes', GaussianArgs.n_classes),
                                       shuffle=kwargs.get('shuffle', GaussianArgs.shuffle),
                                       random_state=kwargs.get('random_state', GaussianArgs.random_state))
    elif style == 'hastie':
        return make_hastie_10_2(n_samples,
                                random_state=kwargs.get('random_state', HastieArgs.random_state))
    elif style == 'circles':
        return make_circles(n_samples,
                            kwargs.get('shuffle', CirclesArgs.shuffle),
                            kwargs.get('noise', CirclesArgs.noise),
                            kwargs.get('random_state', CirclesArgs.random_state),
                            kwargs.get('factor', CirclesArgs.factor))
    elif style == 'moons':
        return make_moons(n_samples,
                          kwargs.get('shuffle', MoonsArgs.shuffle),
                          kwargs.get('noise', MoonsArgs.noise),
                          kwargs.get('random_state', MoonsArgs.random_state))
    elif style == 'biclusters':
        x = make_biclusters(kwargs.get('shape', BiclusterArgs.shape),
                            kwargs.get('n_clusters', BiclusterArgs.n_clusters),
                            kwargs.get('noise', BiclusterArgs.noise),
                            kwargs.get('minval', BiclusterArgs.minval),
                            kwargs.get('maxval', BiclusterArgs.maxval),
                            kwargs.get('shuffle', BiclusterArgs.shuffle),
                            kwargs.get('random_state', BiclusterArgs.random_state))
        if as_dataframe:
            return pd.concat([pd.DataFrame(x[0]), pd.DataFrame(x[1].T)], axis=1)
        else:
            return x

    elif style == 'scurve':
        return make_s_curve(n_samples,
                            kwargs.get('noise', SCurveArgs.noise),
                            kwargs.get('random_state', SCurveArgs.random_state))
    elif style == 'checker':
        return make_checkerboard(kwargs.get('shape', CheckerArgs.shape),
                                 kwargs.get('n_clusters', CheckerArgs.n_clusters),
                                 kwargs.get('noise', CheckerArgs.noise),
                                 kwargs.get('minval', CheckerArgs.minval),
                                 kwargs.get('maxval', CheckerArgs.maxval),
                                 kwargs.get('shuffle', CheckerArgs.shuffle),
                                 kwargs.get('random_state', CheckerArgs.random_state))
    elif style == 'friedman':
        return make_friedman1(n_samples,
                              kwargs.get('n_features', FriedmanArgs.n_features),
                              kwargs.get('noise', FriedmanArgs.noise),
                              kwargs.get('random_state', FriedmanArgs.random_state))
    elif style == 'friedman2':
        return make_friedman2(n_samples,
                              kwargs.get('noise', Friedman2Args.noise),
                              kwargs.get('random_state', Friedman2Args.random_state))
    elif style == 'friedman3':
        return make_friedman3(n_samples,
                              kwargs.get('noise', Friedman3Args.noise),
                              kwargs.get('random_state', Friedman3Args.random_state))
Example #29
def getSKData(style='timeseries', n_samples=1, **kwargs):
    if isinstance(style, str):
        style = Style(style.lower())
    if style == Style.REGRESSION:
        return make_regression(
            n_samples, kwargs.get('n_features', RegressionArgs.n_features),
            kwargs.get('n_informative', RegressionArgs.n_informative),
            kwargs.get('n_targets', RegressionArgs.n_targets),
            kwargs.get('bias', RegressionArgs.bias),
            kwargs.get('effective_rank', RegressionArgs.effective_rank),
            kwargs.get('tail_strength', RegressionArgs.tail_strength),
            kwargs.get('noise', RegressionArgs.noise),
            kwargs.get('shuffle', RegressionArgs.shuffle),
            kwargs.get('coef', RegressionArgs.coef),
            kwargs.get('random_state', RegressionArgs.random_state))
    elif style == Style.BLOBS:
        return make_blobs(n_samples,
                          kwargs.get('n_features', BlobsArgs.n_features),
                          kwargs.get('centers', BlobsArgs.centers),
                          kwargs.get('cluster_std', BlobsArgs.cluster_std),
                          kwargs.get('center_box', BlobsArgs.center_box),
                          kwargs.get('shuffle', BlobsArgs.shuffle),
                          kwargs.get('random_state', BlobsArgs.random_state))
    elif style == Style.CLASSIFICATION:
        return make_classification(
            n_samples, kwargs.get('n_features', ClassificationArgs.n_features),
            kwargs.get('n_informative', ClassificationArgs.n_informative),
            kwargs.get('n_redundant', ClassificationArgs.n_redundant),
            kwargs.get('n_repeated', ClassificationArgs.n_repeated),
            kwargs.get('n_classes', ClassificationArgs.n_classes),
            kwargs.get('n_clusters_per_class',
                       ClassificationArgs.n_clusters_per_class),
            kwargs.get('weights', ClassificationArgs.weights),
            kwargs.get('flip_y', ClassificationArgs.flip_y),
            kwargs.get('class_sep', ClassificationArgs.class_sep),
            kwargs.get('hypercube', ClassificationArgs.hypercube),
            kwargs.get('shift', ClassificationArgs.shift),
            kwargs.get('scale', ClassificationArgs.scale),
            kwargs.get('shuffle', ClassificationArgs.shuffle),
            kwargs.get('random_state', ClassificationArgs.random_state))
    elif style == Style.MULTILABEL:
        return make_multilabel_classification(
            n_samples,
            kwargs.get('n_features', MultilabelClassificationArgs.n_features),
            kwargs.get('n_classes', MultilabelClassificationArgs.n_classes),
            kwargs.get('n_labels', MultilabelClassificationArgs.n_labels),
            kwargs.get('length', MultilabelClassificationArgs.length),
            kwargs.get('allow_unlabeled',
                       MultilabelClassificationArgs.allow_unlabeled),
            kwargs.get('sparse', MultilabelClassificationArgs.sparse),
            kwargs.get('return_indicator',
                       MultilabelClassificationArgs.return_indicator),
            kwargs.get('return_distributions',
                       MultilabelClassificationArgs.return_distributions),
            kwargs.get('random_state',
                       MultilabelClassificationArgs.random_state))
    elif style == Style.GAUSSIAN:
        return make_gaussian_quantiles(
            n_samples=n_samples,
            n_features=kwargs.get('n_features', GaussianArgs.n_features),
            mean=kwargs.get('mean', GaussianArgs.mean),
            cov=kwargs.get('cov', GaussianArgs.cov),
            n_classes=kwargs.get('n_classes', GaussianArgs.n_classes),
            shuffle=kwargs.get('shuffle', GaussianArgs.shuffle),
            random_state=kwargs.get('random_state', GaussianArgs.random_state))
    elif style == Style.HASTIE:
        return make_hastie_10_2(n_samples,
                                random_state=kwargs.get(
                                    'random_state', HastieArgs.random_state))
    elif style == Style.CIRCLES:
        return make_circles(
            n_samples, kwargs.get('shuffle', CirclesArgs.shuffle),
            kwargs.get('noise', CirclesArgs.noise),
            kwargs.get('random_state', CirclesArgs.random_state),
            kwargs.get('factor', CirclesArgs.factor))
    elif style == Style.MOONS:
        return make_moons(n_samples, kwargs.get('shuffle', MoonsArgs.shuffle),
                          kwargs.get('noise', MoonsArgs.noise),
                          kwargs.get('random_state', MoonsArgs.random_state))
    elif style == Style.BICLUSTERS:
        return make_biclusters(
            kwargs.get('shape', BiclusterArgs.shape),
            kwargs.get('n_clusters', BiclusterArgs.n_clusters),
            kwargs.get('noise', BiclusterArgs.noise),
            kwargs.get('minval', BiclusterArgs.minval),
            kwargs.get('maxval', BiclusterArgs.maxval),
            kwargs.get('shuffle', BiclusterArgs.shuffle),
            kwargs.get('random_state', BiclusterArgs.random_state))
    elif style == Style.SCURVE:
        return make_s_curve(
            n_samples, kwargs.get('noise', SCurveArgs.noise),
            kwargs.get('random_state', SCurveArgs.random_state))
    elif style == Style.CHECKER:
        return make_checkerboard(
            kwargs.get('shape', CheckerArgs.shape),
            kwargs.get('n_clusters', CheckerArgs.n_clusters),
            kwargs.get('noise', CheckerArgs.noise),
            kwargs.get('minval', CheckerArgs.minval),
            kwargs.get('maxval', CheckerArgs.maxval),
            kwargs.get('shuffle', CheckerArgs.shuffle),
            kwargs.get('random_state', CheckerArgs.random_state))
    elif style == Style.FRIEDMAN:
        return make_friedman1(
            n_samples, kwargs.get('n_features', FriedmanArgs.n_features),
            kwargs.get('noise', FriedmanArgs.noise),
            kwargs.get('random_state', FriedmanArgs.random_state))
    elif style == Style.FRIEDMAN2:
        return make_friedman2(
            n_samples, kwargs.get('noise', Friedman2Args.noise),
            kwargs.get('random_state', Friedman2Args.random_state))
    elif style == Style.FRIEDMAN3:
        return make_friedman3(
            n_samples, kwargs.get('noise', Friedman3Args.noise),
            kwargs.get('random_state', Friedman3Args.random_state))
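A usage sketch (assuming the Style enum defined elsewhere in the source accepts these string values); each style returns whatever the underlying make_* helper returns, typically an (X, y) tuple:

X, y = getSKData(style='friedman3', n_samples=100, noise=0.1, random_state=0)
print(X.shape, y.shape)  # (100, 4) (100,)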
Example #30
"""
Created on Wed Apr 17 18:50:49 2019

author: Fabio Sigrist
"""

import sklearn.datasets as datasets
import numpy as np
import KTBoost.KTBoost as KTBoost
"""
Example 1
"""
# simulate data
np.random.seed(10)  # make_friedman3 and np.random.normal draw from numpy's global RNG
n = 1000
X, lp = datasets.make_friedman3(n_samples=n)
X_test, y_test = datasets.make_friedman3(n_samples=n)
lp = lp * 5 + 0.2
y_test = y_test * 5 + 0.2
y = np.random.normal(loc=lp, scale=1)
# apply censoring
yu = 8
yl = 5
y[y >= yu] = yu
y[y <= yl] = yl

# train model and make predictions
model = KTBoost.BoostingRegressor(loss='tobit', yl=yl, yu=yu).fit(X, y)
y_pred = model.predict(X_test)
# mean square error (approx. 0.44 for n=1000)
print("Test error Grabit: " + str(((y_pred - y_test)**2).mean()))