Example #1
0
def task1(sample_size, dimension_size):
    """Train an SGD regressor on generated data and return its test error.

    ``sample_size`` and ``dimension_size`` are forwarded positionally to
    ``datasets.make_sparse_uncorrelated``. The generated data is split 70/30
    into training and test parts, the regressor is fitted on the training
    part, and the mean squared error on the test part is returned.
    """
    features, targets = datasets.make_sparse_uncorrelated(
        sample_size, dimension_size)
    split = model_selection.train_test_split(
        features, targets, test_size=0.3, train_size=0.7)
    train_features, test_features, train_targets, test_targets = split

    model = linear_model.SGDRegressor(max_iter=1000)
    model.fit(train_features, train_targets)
    predictions = model.predict(test_features)
    return metrics.mean_squared_error(test_targets, predictions)

    # Benchmark driver: for every feature count r, run task1 five times on
    # 10000 samples and print the mean error and the summed wall-clock time.
    for r in [100, 1000, 2000, 100000, 250000, 500000]:
        # Start from empty lists for each r so the printed figures describe
        # only this feature count; lists shared across r would report a
        # running average / running total over all earlier settings too.
        error_list = []
        time_list = []
        for _ in range(5):
            start = time.time()
            error_list.append(task1(10000, r))
            stop = time.time()
            time_list.append(stop - start)
        print(
            str(r) + ": Error:" + str(sum(error_list) / len(error_list)) +
            "-time:" + str(sum(time_list)))

    def generators_for_regression_datasets(self):
        """
            Regression dataset generators.

            Emits a section banner at debug level, then prints the result of
            calling ``datasets.make_sparse_uncorrelated()`` with no arguments.
        """
        logging.debug('----------------- Generators for regression  -----------')

        sparse_uncorrelated = datasets.make_sparse_uncorrelated()
        print('sparse_uncorrelated ', sparse_uncorrelated)
Example #3
0
def test_linear_regression_positive_vs_nonpositive():
    """Check that ``positive=True`` and ``positive=False`` fits disagree.

    Both models are fitted on the same seeded ``make_sparse_uncorrelated``
    data; the mean squared difference of their coefficients must exceed 1e-3.
    """
    X, y = make_sparse_uncorrelated(random_state=0)

    constrained = LinearRegression(positive=True)
    constrained.fit(X, y)
    unconstrained = LinearRegression(positive=False)
    unconstrained.fit(X, y)

    coef_gap = constrained.coef_ - unconstrained.coef_
    assert np.mean(coef_gap**2) > 1e-3
def get_data(dataset_name):
    """Return the data matrix for the dataset called ``dataset_name``.

    Recognised names are ``"lfw_people"``, ``"20newsgroups"`` (only the first
    100000 columns are kept), ``"olivetti_faces"``, ``"rcv1"``, ``"CIFAR"``,
    ``"SVHN"``, ``"low rank matrix"``, ``"uncorrelated matrix"`` and
    ``"big sparse matrix"``. Any other name is handed to ``fetch_openml``.

    Returns ``None`` instead of a matrix when the ``handle_missing_dataset``
    check on the CIFAR or SVHN folder matches its skip value.
    """
    print("Getting dataset: %s" % dataset_name)

    if dataset_name == "lfw_people":
        return fetch_lfw_people().data

    if dataset_name == "20newsgroups":
        return fetch_20newsgroups_vectorized().data[:, :100000]

    if dataset_name == "olivetti_faces":
        return fetch_olivetti_faces().data

    if dataset_name == "rcv1":
        return fetch_rcv1().data

    if dataset_name == "CIFAR":
        # NOTE(review): this branch compares against "skip" while the SVHN
        # branch compares against 0 -- confirm which value
        # handle_missing_dataset actually returns for a missing folder.
        if handle_missing_dataset(CIFAR_FOLDER) == "skip":
            return None
        # The five training batches are named data_batch_1 .. data_batch_5.
        batches = [
            unpickle("%sdata_batch_%d" % (CIFAR_FOLDER, batch_no))
            for batch_no in range(1, 6)
        ]
        return np.vstack(batches)

    if dataset_name == "SVHN":
        if handle_missing_dataset(SVHN_FOLDER) == 0:
            return None
        images = sp.io.loadmat("%strain_32x32.mat" % SVHN_FOLDER)["X"]
        # The last axis indexes the images; flatten each one into a row.
        flat_images = [
            images[:, :, :, idx].reshape(32 * 32 * 3)
            for idx in range(images.shape[3])
        ]
        return np.vstack(flat_images)

    if dataset_name == "low rank matrix":
        return make_low_rank_matrix(
            n_samples=500,
            n_features=int(1e4),
            effective_rank=100,
            tail_strength=0.5,
            random_state=random_state,
        )

    if dataset_name == "uncorrelated matrix":
        features, _ = make_sparse_uncorrelated(
            n_samples=500, n_features=10000, random_state=random_state)
        return features

    if dataset_name == "big sparse matrix":
        n_entries = int(1e6)
        n_rows = int(1e6)
        n_cols = int(1e4)
        # A tenth as many distinct values as entries, each repeated 10 times.
        values = np.repeat(np.random.normal(0, 1, int(n_entries / 10)), 10)
        # Row and column coordinates are both drawn from [0, n_cols).
        row_coords = np.random.uniform(0, n_cols, n_entries)
        col_coords = np.random.uniform(0, n_cols, n_entries)
        return sp.sparse.csr_matrix(
            (values, (row_coords, col_coords)), shape=(n_rows, n_cols))

    return fetch_openml(dataset_name, parser="auto").data
Example #5
0
def task2_sampling(sample_size, dimension_size, new_sample_size):
    """Fit an SGD regressor on a subsample and return ``(error, seconds)``.

    Data is generated with ``datasets.make_sparse_uncorrelated``, reduced to
    ``new_sample_size`` rows by ``resample`` without replacement, and split
    70/30 into training and test parts. Only fitting, prediction and scoring
    are timed; data generation, resampling and splitting are not.
    """
    full_X, full_y = datasets.make_sparse_uncorrelated(
        sample_size, dimension_size)
    sub_X, sub_y = resample(
        full_X, full_y, n_samples=new_sample_size, replace=False)
    train_X, test_X, train_y, test_y = model_selection.train_test_split(
        sub_X, sub_y, test_size=0.3, train_size=0.7)

    started = time.time()
    model = linear_model.SGDRegressor(max_iter=1000)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    mse = metrics.mean_squared_error(test_y, predictions)
    finished = time.time()

    return mse, finished - started
Example #6
0
def task2_pca(sample_size, dimension_size, component_size):
    """Fit an SGD regressor on PCA-reduced data and return ``(error, seconds)``.

    Data is generated with ``datasets.make_sparse_uncorrelated``, projected
    onto ``component_size`` principal components, and split 70/30 into
    training and test parts. Only fitting, prediction and scoring are timed;
    data generation, the PCA step and splitting are not.
    """
    raw_X, targets = datasets.make_sparse_uncorrelated(
        sample_size, dimension_size)

    reducer = decomposition.PCA(n_components=component_size)
    reducer.fit(raw_X)
    reduced_X = reducer.transform(raw_X)

    train_X, test_X, train_y, test_y = model_selection.train_test_split(
        reduced_X, targets, test_size=0.3, train_size=0.7)

    started = time.time()
    model = linear_model.SGDRegressor(max_iter=1000)
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    mse = metrics.mean_squared_error(test_y, predictions)
    finished = time.time()

    return mse, finished - started
Example #7
0
def test_linear_regression_positive_multiple_outcome(random_state=0):
    """Check non-negative linear regression with two identical targets.

    Fitting on a two-column target built from the same ``y`` must give a
    ``(2, n_features)`` coefficient array with no negative entry, and its
    predictions must match a single-target fit stacked twice.
    """
    rng = check_random_state(random_state)
    X, y = make_sparse_uncorrelated(random_state=rng)
    stacked_targets = np.vstack((y, y)).T
    feature_count = X.shape[1]

    model = LinearRegression(positive=True)

    # Two-target fit.
    model.fit(X, stacked_targets)
    assert model.coef_.shape == (2, feature_count)
    assert np.all(model.coef_ >= 0.0)
    multi_pred = model.predict(X)

    # Single-target fit on the same estimator instance.
    model.fit(X, y.ravel())
    single_pred = model.predict(X)

    assert_allclose(np.vstack((single_pred, single_pred)).T, multi_pred)
Example #8
0
def test_linear_regression_sparse_multiple_outcome(random_state=0):
    """Check linear regression with two identical targets on sparse input.

    ``X`` is converted to a COO sparse matrix. The two-target fit must give a
    ``(2, n_features)`` coefficient array, and its predictions must agree to
    three decimals with a single-target fit stacked twice.
    """
    rng = check_random_state(random_state)
    dense_X, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(dense_X)
    stacked_targets = np.vstack((y, y)).T
    feature_count = X.shape[1]

    model = LinearRegression()

    # Two-target fit.
    model.fit(X, stacked_targets)
    assert model.coef_.shape == (2, feature_count)
    multi_pred = model.predict(X)

    # Single-target fit on the same estimator instance.
    model.fit(X, y.ravel())
    single_pred = model.predict(X)

    assert_array_almost_equal(
        np.vstack((single_pred, single_pred)).T, multi_pred, decimal=3)
Example #9
0
def test_linear_regression_sparse_multiple_outcome(setup, random_state=0):
    """Check that fitting on sparse input is rejected.

    Both the two-target and the single-target fit on a COO sparse ``X`` must
    raise ``NotImplementedError`` with the "Does not support sparse input!"
    message. ``setup`` is accepted but not used in the body.
    """
    rng = check_random_state(random_state)
    dense_X, y = make_sparse_uncorrelated(random_state=rng)
    X = sparse.coo_matrix(dense_X)
    stacked_targets = np.vstack((y, y)).T

    model = LinearRegression()
    expected_msg = re.escape("Does not support sparse input!")

    # Two-target fit.
    with pytest.raises(NotImplementedError, match=expected_msg):
        model.fit(X, stacked_targets)

    # Single-target fit.
    with pytest.raises(NotImplementedError, match=expected_msg):
        model.fit(X, y.ravel())
Example #10
0
def test_residualize_linear():
    """Sanity-check the linear ``Residualize`` model.

    For random sample counts and confound counts, the features left after
    residualizing must be orthogonal to the confounds.
    """
    # make_sparse_uncorrelated needs at least 4 features
    lowest_dim = 6
    highest_dim = 100

    for sample_count in np.random.randint(20, 500, 3):
        for confound_count in np.random.randint(lowest_dim, highest_dim, 3):
            total_features = lowest_dim + confound_count + 1
            full_matrix, _ = make_sparse_uncorrelated(
                n_samples=sample_count, n_features=total_features)

            features, confounds = splitter_X_confounds(
                full_matrix, confound_count)

            residualizer = Residualize(model='linear')
            residualizer.fit(features, confounds)
            residual_features = residualizer.transform(features, confounds)

            # After residualizing, the cross product with the confounds
            # must vanish.
            assert_almost_equal(residual_features.T.dot(confounds), 0)
def test_make_sparse_uncorrelated():
    """Check the shapes returned by ``make_sparse_uncorrelated``."""
    sample_count, feature_count = 5, 10
    X, y = make_sparse_uncorrelated(
        n_samples=sample_count, n_features=feature_count, random_state=0)

    assert_equal(X.shape, (sample_count, feature_count), "X shape mismatch")
    assert_equal(y.shape, (sample_count,), "y shape mismatch")
Example #12
0
def test_make_sparse_uncorrelated():
    """Check the shapes returned by ``make_sparse_uncorrelated``."""
    sample_count, feature_count = 5, 10
    X, y = make_sparse_uncorrelated(
        n_samples=sample_count, n_features=feature_count, random_state=0)

    expected_X_shape = (sample_count, feature_count)
    expected_y_shape = (sample_count,)
    assert X.shape == expected_X_shape, "X shape mismatch"
    assert y.shape == expected_y_shape, "y shape mismatch"
Example #13
0
from sklearn import datasets
import matplotlib.pyplot as plt

# Generate a regression problem with make_sparse_uncorrelated: 100 samples,
# 10 features. random_state=None means no fixed seed is supplied, so the
# values differ from run to run.
X,y = datasets.make_sparse_uncorrelated(
    n_samples=100,n_features=10,random_state=None)

# Print the feature matrix, then the target vector, each under a label.
print('X = ')
print(X)
print('y = ')
print(y)
# Print the return value of several `datasets` generators, each called with
# its default arguments (presumably sklearn.datasets -- the import is not
# visible in this fragment).
print("the output of make_hastie_10_2() :: ", datasets.make_hastie_10_2())

# make_moons() with default arguments
print("the output of make_moons() :: ", datasets.make_moons())

# make_multilabel_classification() with default arguments
print("the output of make_multilabel_classification() :: ",
      datasets.make_multilabel_classification())

# make_regression() with default arguments
print("the output of make_regression() :: ", datasets.make_regression())

# make_sparse_spd_matrix() with default arguments
print("the output of make_sparse_spd_matrix() :: ",
      datasets.make_sparse_spd_matrix())

# make_sparse_uncorrelated() with default arguments
print("the output of make_sparse_uncorrelated() :: ",
      datasets.make_sparse_uncorrelated())

# NOTE(review): identical to the statement just above -- possibly a
# copy-paste duplicate where another generator was intended; confirm.
print("the output of make_sparse_uncorrelated() :: ",
      datasets.make_sparse_uncorrelated())

# make_swiss_roll() with default arguments
print("the output of make_swiss_roll() :: ", datasets.make_swiss_roll())

# mldata_filename() applied to the name 'iris.txt'.
# NOTE(review): mldata_filename appears to have been deprecated and later
# removed from scikit-learn together with fetch_mldata -- confirm that the
# targeted scikit-learn version still provides it.
print("the output of mldata_filename() :: ",
      datasets.mldata_filename('iris.txt'))
Example #15
0
# Experiment configuration.
random_state = 414
saving_fig = False  # set to True to save images

# dataset = "synthetic_unco"  # Fig a
dataset = "synthetic"  # Fig b

# Build the design matrix X and target y for the selected dataset.
# Strings are compared with ==: `is` tests object identity, which matches
# equal string literals only through interning and raises a SyntaxWarning
# on Python 3.8+.
if dataset == "synthetic":
    n_samples, n_features = (500, 5000)
    X, y = make_regression(n_samples=n_samples,
                           n_features=n_features,
                           random_state=random_state)
elif dataset == "synthetic_unco":
    n_samples, n_features = (30, 50)
    X, y = make_sparse_uncorrelated(n_samples=n_samples,
                                    n_features=n_features,
                                    random_state=random_state)
else:
    # Fail with a clear message instead of a NameError on X further down.
    raise ValueError("Unknown dataset: %r" % (dataset,))

# Convert to float arrays in Fortran (column-major) memory order.
X = np.asfortranarray(X.astype(float))
y = np.asfortranarray(y.astype(float))

n_samples, n_features = X.shape

# Scale every column of X to unit Euclidean norm and standardise y to zero
# mean and unit standard deviation.
X /= np.linalg.norm(X, axis=0)
y = (y - y.mean()) / y.std()

# Hold out 30% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test =\
    train_test_split(X, y, test_size=0.30, random_state=random_state)
Example #16
0
# -*- encoding: utf-8 -*-
"""
8.5.1 Linear regression
"""

from sklearn.datasets import make_sparse_uncorrelated
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split as tsplit
from sklearn import metrics

import matplotlib.pyplot as plt
import numpy as np

# Generate 100 samples with 4 features and hold out 10% of them for testing.
X, y = make_sparse_uncorrelated(n_samples=100, n_features=4)
X_train, X_test, y_train, y_test = tsplit(X, y, test_size=0.1)
reg = LinearRegression()  # instantiate the least-squares linear regression model
reg.fit(X_train, y_train)  # train
y_pred = reg.predict(X_test)  # predict

print(y_pred)  # predicted values
print(y_test)  # actual values

print(metrics.mean_squared_error(y_test, y_pred))  # mean squared error
print(metrics.r2_score(y_test, y_pred))  # coefficient of determination (R^2)
print(metrics.median_absolute_error(y_test, y_pred))  # median absolute error

# Select the FangSong font, presumably so the Chinese plot title renders, and
# turn off the Unicode minus sign on the axes.
plt.rcParams['font.sans-serif'] = ['FangSong']
plt.rcParams['axes.unicode_minus'] = False
# Left-hand panel of a 1x2 grid: residuals (prediction minus actual value)
# drawn as points. The title string means "residual plot".
plt.subplot(121)
plt.title('残差图')
plt.plot(y_pred - y_test, 'o')