Example #1
def FriedmanDataset_1():
    d_train = datasets.make_friedman1(240, 10, 1)
    d_test = datasets.make_friedman1(1000, 10)

    features_train = d_train[0] + np.random.normal(0, 1, 240).reshape((240, 1))
    target_train = d_train[1]
    features_test = d_test[0]
    target_test = d_test[1]

    return features_train, target_train, features_test, target_test
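
In newer scikit-learn releases the noise and random_state arguments of make_friedman1 are keyword-only, so positional calls such as make_friedman1(240, 10, 1) may raise a TypeError. A minimal sketch of the same split written with keyword arguments (friedman1_split is a hypothetical helper name; note that make_friedman1's own noise parameter perturbs the target y, while the example above also perturbs the training features by hand):

import numpy as np
from sklearn import datasets

def friedman1_split():
    # Keyword-argument form of the two generator calls above.
    X_train, y_train = datasets.make_friedman1(n_samples=240, n_features=10, noise=1.0)
    X_test, y_test = datasets.make_friedman1(n_samples=1000, n_features=10)
    # Mirror the manual feature noise added in the example above.
    X_train = X_train + np.random.normal(0, 1, 240).reshape((240, 1))
    return X_train, y_train, X_test, y_test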
Example #2
def makedata():
  n_points = 500 # points
 
  X, y =  make_friedman1(n_samples=n_points, n_features=5, 
                         noise=1.0, random_state=100)
         
  return train_test_split(X, y, test_size=0.5, random_state=3)
def make_data_weights_biases(neurons, twolayers):
    X, y = make_friedman1(n_samples=1000,
                          n_features=5,
                          noise=0.0,
                          random_state=None)

    W_0 = np.random.rand(X.shape[1], neurons)
    b_0 = np.zeros((1, neurons))
    if twolayers:
        W_1 = np.random.rand(neurons, neurons)
        W_2 = np.random.rand(neurons, 1)
        b_1 = np.zeros((1, neurons))
        b_2 = np.zeros((1, 1))
    else:
        W_1 = np.random.rand(neurons, 1)
        b_1 = np.zeros((1, 1))
        W_2 = None
        b_2 = None

    print("X rows: " + repr(X.shape[0]) + ", " + "X columns: " +
          repr(X.shape[1]))
    print("Y rows: " + repr(y.shape))
    print("W_0: " + repr(W_0) + ", " + "W_1: " + repr(W_1) + "b_0: " +
          repr(b_0) + "b_1: " + repr(b_1))
    if twolayers:
        print("W_0: " + repr(W_0) + ", " + "W_1: " + repr(W_1) + "W_2: " +
              repr(W_2) + "b_0: " + repr(b_0) + "b_1: " + repr(b_1) + "b_2: " +
              repr(b_2))
        return X, y, W_0, W_1, W_2, b_0, b_1, b_2
    else:
        print("W_0: " + repr(W_0) + ", " + "W_1: " + repr(W_1) + "b_0: " +
              repr(b_0) + "b_1: " + repr(b_1))
        return X, y, W_0, W_1, b_0, b_1
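
The weight and bias arrays built above are shaped for a small fully connected network on the 5 Friedman #1 features (one or two hidden layers of "neurons" units and a single output). A sketch of how they might be consumed in a forward pass, assuming a ReLU hidden activation and a linear output; forward_pass is a hypothetical helper, not part of the example:

import numpy as np

def forward_pass(X, W_0, W_1, b_0, b_1, W_2=None, b_2=None):
    # Hidden layer: (n_samples, n_features) @ (n_features, neurons) + (1, neurons)
    h = np.maximum(X @ W_0 + b_0, 0.0)       # ReLU (assumed)
    if W_2 is None:
        return h @ W_1 + b_1                 # one hidden layer -> (n_samples, 1)
    h2 = np.maximum(h @ W_1 + b_1, 0.0)      # second hidden layer
    return h2 @ W_2 + b_2                    # output layer -> (n_samples, 1)

For example, X, y, W_0, W_1, b_0, b_1 = make_data_weights_biases(8, False) followed by y_hat = forward_pass(X, W_0, W_1, b_0, b_1) yields untrained predictions of shape (1000, 1).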
Example #4
def test_data():
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    xtrain, xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=.30,
                                                    random_state=0)
    return xtrain, xtest, ytrain, ytest
def test_regression_synthetic():
    """Test on synthetic regression datasets used in Leo Breiman,
    `Bagging Predictors`. Machine Learning 24(2): 123-140 (1996). """
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 1, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor()
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse
Example #6
def error_curves(estimator, parameter, parameter_values, n_repeat=100):
    all_train_errors = []
    all_test_errors = []

    for i in range(n_repeat):
        X, y = make_friedman1(n_samples=200)
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            train_size=0.7)

        train_errors = []
        test_errors = []

        for j, p in enumerate(parameter_values):
            est = estimator(**{parameter: p})
            est.fit(X_train, y_train)

            train_errors.append(
                mean_squared_error(y_train, est.predict(X_train)))
            test_errors.append(mean_squared_error(y_test, est.predict(X_test)))

        all_train_errors.append(train_errors)
        all_test_errors.append(test_errors)

    return all_train_errors, all_test_errors
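
error_curves returns one list of training errors and one list of test errors per repetition, each with one entry per parameter value. A short usage sketch, assuming scikit-learn's DecisionTreeRegressor and matplotlib for plotting the curves averaged over repetitions:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor

depths = list(range(1, 16))
train_err, test_err = error_curves(DecisionTreeRegressor, "max_depth", depths, n_repeat=20)

plt.plot(depths, np.mean(train_err, axis=0), label="train MSE")
plt.plot(depths, np.mean(test_err, axis=0), label="test MSE")
plt.xlabel("max_depth")
plt.legend()
plt.show()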
Example #7
    def test_fwls_regressor(self):
        feature_func = lambda x: np.ones(x.shape)
        bclf = LinearRegression()
        clfs = [
            RandomForestRegressor(n_estimators=50, random_state=1),
            GradientBoostingRegressor(n_estimators=25, random_state=1),
            Ridge(random_state=1)
        ]

        # Friedman1
        X, y = datasets.make_friedman1(n_samples=1200,
                                       random_state=1,
                                       noise=1.0)
        X_train, y_train = X[:200], y[:200]
        X_test, y_test = X[200:], y[200:]

        sr = FWLSRegressor(bclf,
                           clfs,
                           feature_func,
                           n_folds=3,
                           verbose=0,
                           oob_score_flag=True)
        sr.fit(X_train, y_train)
        mse = mean_squared_error(y_test, sr.predict(X_test))
        assert_less(mse, 6.0)
Example #8
 def setUp(self):
     # Friedman1
     self.X, self.y = datasets.make_friedman1(n_samples=500,
                                              random_state=1,
                                              noise=1.0)
     self.X_train, self.y_train = self.X[:400], self.y[:400]
     self.X_test, self.y_test = self.X[400:], self.y[400:]
Example #9
    def __init__(self, numFeatures, numSamples, randomSeed):
        """
        :param numFeatures: total number of features to be used (at least 5)
        :param numSamples: number of samples in dataset
        :param randomSeed: random seed value used for reproducible results
        """

        self.numFeatures = numFeatures
        self.numSamples = numSamples
        self.randomSeed = randomSeed

        # generate test data:
        self.X, self.y = datasets.make_friedman1(n_samples=self.numSamples,
                                                 n_features=self.numFeatures,
                                                 noise=self.NOISE,
                                                 random_state=self.randomSeed)

        # divide the data to a training set and a validation set:
        self.X_train, self.X_validation, self.y_train, self.y_validation = \
            model_selection.train_test_split(self.X, self.y, test_size=self.VALIDATION_SIZE, random_state=self.randomSeed)
        #        print(self.X_train)
        #        print(self.y_train)

        #        np.savetxt('testX.out', (self.X_train))
        #        np.savetxt('testY.out', (self.y_train))
        self.regressor = GradientBoostingRegressor(
            random_state=self.randomSeed)
Example #10
 def _create_test_data(self):
     X, y = datasets.make_friedman1(n_samples=20, random_state=13)
     X = pd.DataFrame(X)
     Y = Response.from_array(y / y.max())
     Z = Partition(size=X.shape[0], folds=5, reps=1, total_size=X.shape[0])
     Z.set(max_reps=1, max_folds=0)
     return Container(X), Y, Z
Example #11
    def test_fit(self):

        n_samples = 10000
        test_size = 0.2
        max_depth = 3
        lr = 0.1
        n_est = 100

        X, y = make_friedman1(n_samples=n_samples)
        n, m = X.shape
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size)

        model = GBM(distribution="gaussian",
                    n_estimators=n_est,
                    learning_rate=lr,
                    max_depth=max_depth)

        model.fit(X_train, y_train)
        y_hat = model.predict(X_test)

        mse_gbm = np.mean((y_test - y_hat)**2)
        mse_baseline = np.mean((y_test - np.mean(y_train))**2)

        self.assertTrue(mse_gbm < mse_baseline)
Example #12
def run():
    """Run profiling."""
    lc = LayerGenerator().get_sequential('stack', False, False)

    cm = CMLog(verbose=False)
    cm.monitor()

    sleep(5)

    t1 = int(np.floor(perf_counter() - cm._t0) * 10)
    sleep(0.1)
    x, z = make_friedman1(int(5 * 1e6))

    sleep(5)

    t2 = int(np.floor(perf_counter() - cm._t0) * 10)
    sleep(0.1)
    lc.fit(x, z)
    t3 = int(np.floor(perf_counter() - cm._t0) * 10)

    sleep(5)

    while not hasattr(cm, 'cpu'):
        cm.collect()
        sleep(1)

    return cm, t1, t2, t3
Example #13
def test():

    X, y = make_friedman1(n_samples=1000)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    reg_params = {"subsample": 0.8, "max_depth": 4}
    rf = RandomForest(base_estimator=RegTree,
                      base_params=reg_params,
                      n_estimators=20)

    print("\n")
    print("-----------------------------------------------------")

    # Fit
    rf.fit(X_train, y_train)

    # Predict
    y_hat_default = rf.predict(X_test)

    y_hat_script = np.zeros(y_test.shape[0])
    for i, x in enumerate(X_test):
        y_hat_script[i] = apply_randomforest(x, rf.dump())

    # Error
    match_rate = np.mean((y_hat_default - y_hat_script) < 1e-12)
    print("match_rate: {0:.5f} %".format(match_rate * 100))
    print("-----------------------------------------------------")
    print("\n")
Example #14
 def setUp(self):
     # Friedman1
     self.X, self.y = datasets.make_friedman1(n_samples=500,
                                              random_state=1,
                                              noise=1.0)
     self.X_train, self.y_train = self.X[:400], self.y[:400]
     self.X_test, self.y_test = self.X[400:], self.y[400:]
Example #15
def main():
    dir = sys.argv[1]
    output_csv = dir + '/friedman1/friedman1_prep.csv'
    names = ["x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "y"]

    (X, y) = data.make_friedman1(n_samples=10000,
                                 random_state=123456,
                                 noise=1.0)
    y = np.matrix(y).T
    df = pd.DataFrame(np.append(X, y, axis=1), columns=names)
    df = scale(df)[1]

    # TODO Transform box-cox.
    lambdas = {
        'x1': 0.73772299748812553,
        'x10': 0.81728280581171431,
        'x2': 0.80698183857607453,
        'x3': 0.73814877672198154,
        'x4': 0.65907211104558194,
        'x5': 0.88664969513868797,
        'x6': 0.78156577216859524,
        'x7': 0.73707418190834051,
        'x8': 0.77589583265069417,
        'x9': 0.80351813801046301
    }
    df = transform_cox(df, lambdas)

    df.to_csv(output_csv, index=False)
Example #16
def make_regression_dataset(dataset, n_rows, n_cols):
    np.random.seed(137)
    if dataset == 'reg1':
        X, y = make_regression(n_rows,
                               n_cols,
                               n_informative=2,
                               n_targets=1,
                               random_state=137)
    elif dataset == 'reg2':
        X, y = make_regression(n_rows,
                               n_cols,
                               n_informative=2,
                               n_targets=1,
                               random_state=137,
                               noise=10)
    elif dataset == 'Friedman':
        X, y = make_friedman1(n_samples=n_rows,
                              n_features=n_cols,
                              noise=0.0,
                              random_state=137)
    else:
        raise ValueError('Wrong option for dataset: ', dataset)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    dtype = np.float32
    X = X.astype(dtype)
    y = y.astype(dtype)
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    return X_train, X_test, y_train, y_test
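
A quick usage sketch for the Friedman branch of the helper above; by default train_test_split holds out 25% of the rows, so 1000 samples split into 750 for training and 250 for testing:

X_train, X_test, y_train, y_test = make_regression_dataset('Friedman', n_rows=1000, n_cols=10)
print(X_train.shape, X_test.shape, X_train.dtype)  # (750, 10) (250, 10) float32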
def test_regression_synthetic():
    """Test on synthetic regression datasets used in Leo Breiman,
    `Bagging Predictors`. Machine Learning 24(2): 123-140 (1996). """
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 1, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor()
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 5.0, "Failed on Friedman1 with mse = %.4f" % mse

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 1700.0, "Failed on Friedman2 with mse = %.4f" % mse

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]
    clf = GradientBoostingRegressor(**regression_params)
    clf.fit(X_train, y_train)
    mse = mean_squared_error(y_test, clf.predict(X_test))
    assert mse < 0.015, "Failed on Friedman3 with mse = %.4f" % mse
def test_simulated_annealing():
    """
    This test creates a dataset that has 5 features that
    are used to compute `y`. The remaining 5 features are
    independent of `y`. This test should select the first 5
    feature columns used to compute `y` more than the second set
    of 5 independent features.
    """
    X, y = make_friedman1(n_samples=200, n_features=10, random_state=10)
    N = 10
    results = np.zeros((N, X.shape[1]))
    for n in range(0, N):
        results[n] = simulated_annealing(scorer, X, y, bools=True)
    assert results.sum(axis=0)[0] >= results.sum(axis=0)[5]
    assert results.sum(axis=0)[1] >= results.sum(axis=0)[6]
    # Omit feature 2 because it is weaker and harder to detect
    assert results.sum(axis=0)[3] >= results.sum(axis=0)[8]
    assert results.sum(axis=0)[4] >= results.sum(axis=0)[9]

    # Test output is non empty
    features = simulated_annealing(scorer, X, y)
    assert len(features) > 0
    features = simulated_annealing(scorer, X, y, bools=True)
    assert len(features) > 0

    # Test outputs are correct types
    features = simulated_annealing(scorer, X, y)
    assert isinstance(features[0], np.int64)
    features = simulated_annealing(scorer, X, y, bools=True)
    assert isinstance(features[0], np.bool_)
def gradient_boosting(features_values_temp, rows_temp, columns_temp, prediction_values_temp, kernel, threshold):
    # kernel: linear, poly, rbf, sigmoid, precomputed

    rows = 0
    while rows_temp > 0:
        rows = rows + 1
        rows_temp = rows_temp - 1

    columns = 0
    while columns_temp > 0:
        columns = columns + 1
        columns_temp = columns_temp - 1

    features_values = [x for x in features_values_temp]
    prediction_values = [y for y in prediction_values_temp]

    rotated = convert_list_to_matrix(features_values, rows, columns)
    scores = np.array(prediction_values)

    threshold = float(threshold)

    estimator = SVR(kernel=kernel)  # try to change to the model for which the test is gonna run (lasso, ridge, etc.)

    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)
    mean_squared_error(y_test, est.predict(X_test))
Example #20
def generate_without_missing_values(data='simple', n_samples=200,
                                    n_features=2, random_state=0):
    """generate canonical regression data"""

    assert data in ['simple', 'linear', 'quadratic', 'friedman']
    np.random.seed(random_state)
    mean = np.ones(n_features)
    ro = .5
    cov = ro * np.ones((n_features, n_features)) +\
          (1 - ro) * np.eye(n_features)
    X = np.random.multivariate_normal(mean, cov, size=n_samples)
    epsilon = 0.1 * np.random.randn(n_samples)

    if data == 'simple':
        y = X[:, 0] + epsilon
    if data == 'linear':
        beta = [1, 2] + list(np.random.randn(n_features-2))
        y = X.dot(beta) + epsilon
    if data == 'quadratic':
        y = X[:, 0] * X[:, 0] + epsilon
    if data == 'friedman':  # X is no longer Gaussian here
        X, y = make_friedman1(n_samples=n_samples,
                              n_features=max(5, n_features),
                              noise=0.1, random_state=random_state)
    return X, y
Example #21
def generate_friedman1(seed):
    (X, y) = data.make_friedman1(n_samples=2000, random_state=seed, noise=1.0)

    # transform values to DataMatrix/DataVector types
    X = sg.DataMatrix(X)
    y = sg.DataVector(y)

    return (X, y)
Example #22
def test_regression():

    X, y = make_friedman1(n_samples=100000, noise=5)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    models = {
        "palobst":
        PaloBoost(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "gbm":
        GBM(
            distribution="gaussian",
            n_estimators=100,
            learning_rate=1.0,
            max_depth=4,
            subsample=0.5,
        ),
        "sklearn":
        GradientBoostingRegressor(n_estimators=100,
                                  learning_rate=1.0,
                                  max_depth=4,
                                  subsample=0.5),
    }

    print("\n")
    print("# Test Regression")
    print("-----------------------------------------------------")
    print(" model_name     train_time     predict_time   rmse   ")
    print("-----------------------------------------------------")
    print(" {0:12}   {1:12}   {2:12}   {3:.5f}".format("baseline", "-", "-",
                                                       np.std(y_test)))

    for name, model in models.items():

        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start

        # Error
        rmse = np.sqrt(np.mean((y_test - y_hat)**2))

        print(" {0:12}   {1:.5f} sec    {2:.5f} sec    {3:.5f}".format(
            name, time_fit, time_pred, rmse))

    print("-----------------------------------------------------")
    print("\n")
Example #23
def test():
    from sklearn.datasets import make_friedman1
    from sklearn.svm import SVR
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = SVR(kernel="linear")
    selector = RFECVp(estimator, step=1, cv=5)
    selector = selector.fit(X, y)
    print(selector.support_)  # doctest: +NORMALIZE_WHITESPACE
    print(selector.ranking_)
Example #24
def friedman1(n_samples=100,
              n_features=10,
              noise=0.0,
              random_state=None):

    return datasets.make_friedman1(n_samples=n_samples,
                                   n_features=n_features,
                                   noise=noise,
                                   random_state=random_state)
Example #25
def test_make_friedman1():
    X, y = make_friedman1(n_samples=5, n_features=10, noise=0.0, random_state=0)

    assert_equal(X.shape, (5, 10), "X shape mismatch")
    assert_equal(y.shape, (5,), "y shape mismatch")

    assert_array_almost_equal(
        y, 10 * np.sin(np.pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4]
    )
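
The assertion above spells out the noiseless Friedman #1 target, y = 10*sin(pi*x1*x2) + 20*(x3 - 0.5)**2 + 10*x4 + 5*x5: only the first five of the ten generated features influence y, and the remaining features are independent of it.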
Example #26
def create_complex_regression_dataset(plot=False):
    from sklearn.datasets import make_friedman1

    X, y = make_friedman1(n_samples=100, n_features=7, random_state=0)
    if plot:
        plt.figure()
        plt.title("Complex regression problem with one input variable")
        plt.scatter(X[:, 2], y, marker="o", s=50)
        plt.show()
    return X, y
Example #27
def test_rfe_importance_getter_validation(importance_getter, err_type, Selector):
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=42)
    estimator = LinearSVR()
    log_estimator = TransformedTargetRegressor(
        regressor=estimator, func=np.log, inverse_func=np.exp
    )

    with pytest.raises(err_type):
        model = Selector(log_estimator, importance_getter=importance_getter)
        model.fit(X, y)
Example #28
def test_select_from_model_pls(PLSEstimator):
    """Check the behaviour of SelectFromModel with PLS estimators.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12410
    """
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = PLSEstimator(n_components=1)
    model = make_pipeline(SelectFromModel(estimator), estimator).fit(X, y)
    assert model.score(X, y) > 0.5
def load_toy_dataset():
    X, Y = make_friedman1(n_samples=200, n_features=15)
    # X = [
    #     [1,1,1,1,1],
    #     [2,2,2,2,2],
    #     [3,3,3,3,3],
    # ]
    # Y = [1.1,2.2,3.3]

    return np.asarray(X), np.asarray(Y)
Example #30
def test_make_friedman1():
    X, y = make_friedman1(n_samples=5, n_features=10, noise=0.0,
                          random_state=0)

    assert_equal(X.shape, (5, 10), "X shape mismatch")
    assert_equal(y.shape, (5,), "y shape mismatch")

    assert_array_almost_equal(y, 10 * np.sin(np.pi * X[:, 0] * X[:, 1])
                                 + 20 * (X[:, 2] - 0.5) ** 2 \
                                 + 10 * X[:, 3] + 5 * X[:, 4])
Example #31
def get_friedman():
    n_samples = 10000
    noise = 5
    X, y = make_friedman1(n_samples=n_samples, noise=noise)
    poly = PolynomialFeatures(degree=2,
                              include_bias=False,
                              interaction_only=True)
    X = poly.fit_transform(X)
    print(X.shape)
    return X, y
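
With the default 10 Friedman #1 features, PolynomialFeatures(degree=2, interaction_only=True, include_bias=False) keeps the 10 original columns and adds the 45 pairwise interaction terms, so the print above reports a shape of (10000, 55).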
Example #32
def test_rfe_pls(ClsRFE, PLSEstimator):
    """Check the behaviour of RFE with PLS estimators.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12410
    """
    X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = PLSEstimator(n_components=1)
    selector = ClsRFE(estimator, step=1).fit(X, y)
    assert selector.score(X, y) > 0.5
Example #33
def make_sample():
    """
    Return (X_train, X_test, y_train, y_test)
    """
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]

    result = (X_train, X_test, y_train, y_test)

    return result
Example #34
def test():

    X, y = make_friedman1(n_samples=10000)
    # X, y = make_friedman2(n_samples=100000)
    # X, y = make_friedman3(n_samples=100000)
    n, m = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    models = {
        "bonsai-reg": RegTree(max_depth=3),
        "bonsai-xgb": XGBTree(max_depth=3),
        "sklearn": DecisionTreeRegressor(max_depth=3),
    }

    print("\n")
    print("-----------------------------------------------------")
    print(" model_name     train_time     predict_time   rmse   ")
    print("-----------------------------------------------------")
    print(" {0:12}   {1:12}   {2:12}   {3:.5f}".format("baseline", "-", "-",
                                                       np.std(y_test)))

    for name, model in models.items():

        # Fit
        start = time.time()
        model.fit(X_train, y_train)
        time_fit = time.time() - start

        # Predict
        start = time.time()
        y_hat = model.predict(X_test)
        time_pred = time.time() - start

        # Error
        rmse = np.sqrt(np.mean((y_test - y_hat)**2))

        print(" {0:12}   {1:.5f} sec    {2:.5f} sec    {3:.5f}".format(
            name, time_fit, time_pred, rmse))

    print("-----------------------------------------------------")
    print("\n")

    print("-----------------------------------------------------")
    print(" model_name     feature_importances_   ")
    print("-----------------------------------------------------")
    for name, model in models.items():
        f_cnt = Counter(
            {i: v
             for i, v in enumerate(model.feature_importances_)})
        fi = ", ".join(
            ["{}:{:.3f}".format(i, v) for i, v in f_cnt.most_common(4)])
        print(" {0:12}   {1}".format(name, fi))
    print("-----------------------------------------------------")
    print("\n")
def test_recursive_feature_elimination():
    """
    This test creates a dataset that has 5 features that are used
    to compute `y`. The remaining 5 features are independent of `y`.
    This test should select the 5 feature columns used to compute `y`.
    """
    X, y = make_friedman1(n_samples=200, n_features=10, random_state=10)

    features = recursive_feature_elimination(scorer,
                                             X,
                                             y,
                                             n_features_to_select=4)

    assert features == [0, 1, 3, 4]

    # Test with n_features_to_select something other than 0.5 number
    # of total features to ensure logic to stop feature elimination
    # works correctly.
    features = recursive_feature_elimination(scorer,
                                             X,
                                             y,
                                             n_features_to_select=3)

    assert len(features) == 3

    # Retest with column names
    X, y = make_friedman1(n_samples=200, n_features=10, random_state=10)

    X = pd.DataFrame(X,
                     columns=[
                         'zero', 'one', 'two', 'three', 'four', 'five', 'six',
                         'seven', 'eight', 'nine'
                     ])

    features = recursive_feature_elimination(scorer,
                                             X,
                                             y,
                                             n_features_to_select=4)

    assert features == ['zero', 'one', 'three', 'four']
Example #36
def poly():

    plt.figure()
    plt.title('Complex regression problem with one input variable')
    X_F1, y_F1 = make_friedman1(n_samples=100, n_features=7, random_state=0)
    plt.scatter(X_F1[:, 2], y_F1, marker='o', s=50)
    plt.show()

    X_train, X_test, y_train, y_test = train_test_split(X_F1,
                                                        y_F1,
                                                        random_state=0)
    linreg = LinearRegression().fit(X_train, y_train)
    print('linear model coeff (w): {}'.format(linreg.coef_))
    print('linear model intercept (b): {:.3f}'.format(linreg.intercept_))
    print('R-squared score (training): {:.3f}'.format(
        linreg.score(X_train, y_train)))
    print('R-squared score (test): {:.3f}'.format(linreg.score(X_test,
                                                               y_test)))

    print(
        '\nNow we transform the original input data to add polynomial features up to degree 2 (quadratic)\n'
    )
    poly = PolynomialFeatures(degree=2)
    X_F1_poly = poly.fit_transform(X_F1)
    X_train, X_test, y_train, y_test = train_test_split(X_F1_poly,
                                                        y_F1,
                                                        random_state=0)
    linreg = LinearRegression().fit(X_train, y_train)
    print('(poly deg 2) linear model coeff (w):\n{}'.format(linreg.coef_))
    print('(poly deg 2) linear model intercept (b): {:.3f}'.format(
        linreg.intercept_))
    print('(poly deg 2) R-squared score (training): {:.3f}'.format(
        linreg.score(X_train, y_train)))
    print('(poly deg 2) R-squared score (test): {:.3f}\n'.format(
        linreg.score(X_test, y_test)))

    print(
        '\nAddition of many polynomial features often leads to overfitting, so we often use polynomial features in combination with regression that has a regularization penalty, like ridge regression.\n'
    )

    X_train, X_test, y_train, y_test = train_test_split(X_F1_poly,
                                                        y_F1,
                                                        random_state=0)
    linreg = Ridge().fit(X_train, y_train)
    print('(poly deg 2 + ridge) linear model coeff (w):\n{}'.format(
        linreg.coef_))
    print('(poly deg 2 + ridge) linear model intercept (b): {:.3f}'.format(
        linreg.intercept_))
    print('(poly deg 2 + ridge) R-squared score (training): {:.3f}'.format(
        linreg.score(X_train, y_train)))
    print('(poly deg 2 + ridge) R-squared score (test): {:.3f}'.format(
        linreg.score(X_test, y_test)))
Example #37
    def genFriedman(self, i=1, N=240, D=10):
        if i not in range(1,4):
            raise Exception('not a correct dataset')

        if i == 1:
            X, Y = datasets.make_friedman1(N, D )

        if i == 2:
            X, Y = datasets.make_friedman2(N, D)

        if i == 3:
            X, Y = datasets.make_friedman3(N, D)
        return X, Y
def generate_baseline_data(include_cat):
    X, y = datasets.make_friedman1(NUM_SAMPLES, 5, 100, 1)

    # convert  to a binomial
    prob = 1 / (1 + np.exp(-y))
    y = np.random.binomial(1, prob)

    print('Event rate = {0:4.4f}'.format(np.sum(y) / NUM_SAMPLES))

    data = np.hstack((y.reshape(-1, 1), X))
    data = pd.DataFrame(data, columns=['y', 'x0', 'x1', 'x2', 'x3', 'x4'])

    if include_cat is True:
        data['c'] = data.apply(lambda row: 'A' if row.y == 1 else 'B', axis=1)

    return data
Example #39
def make_sample():
    """
    Return (X_train, X_test, y_train, y_test)
    """
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]

    result = (
        X_train,
        X_test,
        y_train,
        y_test
    )

    return result
Example #40
def rf_fear_test_home(n=10,n_trees=10):
    cblparallel.start_port_forwarding()
    # Data
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    # Params
    #local_temp_path = os.path.abspath('../temp/')
    #remote_temp_path = 'python/'
    # Write data file locally
    #data_file = mkstemp_safe(cblparallel.config.LOCAL_TEMP_PATH, '.p')
    data_file = mkstemp_safe(cblparallel.config.HOME_TEMP_PATH, '.p')
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)
    # Prepare code
    scripts = [reduced_tree_code % {'data_file' : os.path.join(cblparallel.config.REMOTE_TEMP_PATH, os.path.split(data_file)[-1]),
                            'n_trees' : n_trees,
                            'random_state' : i * n_trees,
                            'output_file' : '%(output_file)s',
                            'flag_file' : '%(flag_file)s'} for i in range(n)]
    # Submit to fear
    with cblparallel.fear(via_gate=True) as fear:
        fear.copy_to(data_file, os.path.join(cblparallel.config.REMOTE_TEMP_PATH, os.path.split(data_file)[-1]))
        output_files = cblparallel.run_batch_on_fear(scripts, max_jobs=1000)
        fear.rm(os.path.join(cblparallel.config.REMOTE_TEMP_PATH, os.path.split(data_file)[-1]))

    # Kill local data file
    os.remove(data_file)    

    # Now do something with the output

    estimators = []
    predictions = []

    for output_file in output_files:
        with open(output_file, 'r') as f:
            #(estimator, prediction) = pickle.load(f)
            prediction = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        #estimators.append(estimator)
        predictions.append(prediction)

    #ens = EnsembleRegressor(estimators)
    #return RMSE(X_test, y_test, ens)

    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
Example #41
def rf_fear_test(n=10,n_trees=1000):
    # Data
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    # Params
    local_temp_path = os.path.abspath('../temp/')
    remote_temp_path = 'python/'
    # Write data file locally
    data_file = mkstemp_safe(local_temp_path, '.p')
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)
    # Prepare code
    scripts = [tree_code % {'data_file' : os.path.split(data_file)[-1],
                            'n_trees' : n_trees,
                            'random_state' : i * n_trees,
                            'output_file' : '%(output_file)s',
                            'flag_file' : '%(flag_file)s'} for i in range(n)]
    # Submit to fear
    with pyfear.fear() as fear:
        fear.copy_to(data_file, os.path.join(remote_temp_path, os.path.split(data_file)[-1]))
        output_files = pyfear.run_python_jobs(scripts, local_temp_path, remote_temp_path, fear)
        fear.rm(os.path.join(remote_temp_path, os.path.split(data_file)[-1]))

    # Kill local data file
    os.remove(data_file)    

    # Now do something with the output

    estimators = []
    predictions = []

    for output_file in output_files:
        with open(output_file, 'r') as f:
            #(estimator, prediction) = pickle.load(f)
            prediction = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        #estimators.append(estimator)
        predictions.append(prediction)

    #ens = EnsembleRegressor(estimators)
    #return RMSE(X_test, y_test, ens)

    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
def test_staged_predict():
    # Test whether staged decision function eventually gives
    # the same prediction.
    X, y = datasets.make_friedman1(n_samples=1200, random_state=1, noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test = X[200:]
    clf = GradientBoostingRegressor()
    # test raise ValueError if not fitted
    assert_raises(ValueError, lambda X: np.fromiter(clf.staged_predict(X), dtype=np.float64), X_test)

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # test if prediction for last stage equals ``predict``
    for y in clf.staged_predict(X_test):
        assert_equal(y.shape, y_pred.shape)

    assert_array_equal(y_pred, y)
Example #43
def test_regression_synthetic():
    # Test on synthetic regression datasets used in Leo Breiman,
    # `Bagging Predictors`. Machine Learning 24(2): 123-140 (1996).
    random_state = check_random_state(1)
    regression_params = {'n_estimators': 100, 'max_depth': 4,
                         'min_samples_split': 2, 'learning_rate': 0.1,
                         'loss': 'ls'}

    # Friedman1
    X, y = datasets.make_friedman1(n_samples=1200,
                                   random_state=random_state,
                                   noise=1.0)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        clf = GradientBoostingRegressor(presort=presort)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 5.0)

    # Friedman2
    X, y = datasets.make_friedman2(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 1700.0)

    # Friedman3
    X, y = datasets.make_friedman3(n_samples=1200, random_state=random_state)
    X_train, y_train = X[:200], y[:200]
    X_test, y_test = X[200:], y[200:]

    for presort in True, False:
        regression_params['presort'] = presort
        clf = GradientBoostingRegressor(**regression_params)
        clf.fit(X_train, y_train)
        mse = mean_squared_error(y_test, clf.predict(X_test))
        assert_less(mse, 0.015)
Example #44
    def test_regressor(self):
        X, y = datasets.make_friedman1(n_samples=1200,
                                       random_state=1,
                                       noise=1.0)
        X_train, y_train = X[:200], y[:200]
        index = [i for i in range(200)]

        rf = RandomForestRegressor()
        jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
        jrf.fit(X_train, y_train, index)
        prediction = jrf.predict(X_train, index)
        mse = mean_squared_error(y_train, prediction)
        assert_less(mse, 6.0)

        rf = RandomForestRegressor(n_estimators=20)
        jrf = JoblibedRegressor(rf, "rfr", cache_dir='')
        jrf.fit(X_train, y_train, index)
        prediction2 = jrf.predict(X_train, index)
        assert_allclose(prediction, prediction2)
Example #45
def local_forest_test(n=10,n_trees=10):
    # Data
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    # Params
#    local_temp_path = os.path.abspath('../temp/')
#    remote_temp_path = 'python/'
    # Write data file locally
    data_file = mkstemp_safe(cblparallel.config.HOME_TEMP_PATH, '.p')
    with open(data_file, 'w') as f:
        pickle.dump((X_train, y_train, X_test), f)
    # Prepare code
    scripts = [reduced_tree_code % {'data_file' : data_file,
                            'n_trees' : n_trees,
                            'random_state' : i * n_trees,
                            'output_file' : '%(output_file)s',
                            'flag_file' : '%(flag_file)s'} for i in range(n)]
    # Run batch in parallel
    output_files = cblparallel.run_batch_locally(scripts)

    # Kill local data file
    os.remove(data_file)    

    # Now do something with the output

    estimators = []
    predictions = []

    for output_file in output_files:
        with open(output_file, 'r') as f:
            #(estimator, prediction) = pickle.load(f)
            prediction = np.genfromtxt(output_file, delimiter=',')
        os.remove(output_file)
        #estimators.append(estimator)
        predictions.append(prediction)

    #ens = EnsembleRegressor(estimators)
    #return RMSE(X_test, y_test, ens)

    ens_pred = np.mean(predictions, axis=0)
    return RMSE_y(y_test, ens_pred)
Example #46
def test_rfe_min_step():
    n_features = 10
    X, y = make_friedman1(n_samples=50, n_features=n_features, random_state=0)
    n_samples, n_features = X.shape
    estimator = SVR(kernel="linear")

    # Test when floor(step * n_features) <= 0
    selector = RFE(estimator, step=0.01)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)

    # Test when step is between (0,1) and floor(step * n_features) > 0
    selector = RFE(estimator, step=0.20)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)

    # Test when step is an integer
    selector = RFE(estimator, step=5)
    sel = selector.fit(X, y)
    assert_equal(sel.support_.sum(), n_features // 2)
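
With n_features_to_select left at its default, RFE keeps half of the features, which is why every case above ends with sel.support_.sum() equal to n_features // 2; the step parameter only controls how many features are dropped per iteration (at least one, even when floor(step * n_features) would be 0).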
Example #47
    def test_stacked_regressor(self):
        bclf = LinearRegression()
        clfs = [RandomForestRegressor(n_estimators=50, random_state=1),
                GradientBoostingRegressor(n_estimators=25, random_state=1),
                Ridge(random_state=1)]

        # Friedman1
        X, y = datasets.make_friedman1(n_samples=1200,
                                       random_state=1,
                                       noise=1.0)
        X_train, y_train = X[:200], y[:200]
        X_test, y_test = X[200:], y[200:]

        sr = StackedRegressor(bclf,
                              clfs,
                              n_folds=3,
                              verbose=0,
                              oob_score_flag=True)
        sr.fit(X_train, y_train)
        mse = mean_squared_error(y_test, sr.predict(X_test))
        assert_less(mse, 6.0)
Example #48
    def create_reg_syn_data(self, reps=1, rows=None):
        """Create synthetic data using friedman1 sample generator.

        Returns
        -------
        X : pd.DataFrame
            The input as a pd.DataFrame.
        Y : np.ndarray
            The targets as a ndarray
        Z : Partition
            The partition object holding 5-folds.
        """
        if rows is None:
            rows = 1000
        X, y = make_friedman1(n_samples=rows, random_state=13)
        X = pd.DataFrame(data=X, columns=map(unicode, range(X.shape[1])))
        Y = Response.from_array(y)
        Z = Partition(size=X.shape[0], folds=5, reps=reps,total_size=X.shape[0])
        Z.set(max_reps=reps,max_folds=0)
        X = Container(dataframe=X)
        return X, Y, Z
Example #49
def error_curves(estimator, parameter, parameter_values, n_repeat=100):
    all_train_errors = []
    all_test_errors = []

    for i in range(n_repeat):
        X, y = make_friedman1(n_samples=200)
        X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

        train_errors = []
        test_errors = []

        for j, p in enumerate(parameter_values):
            est = estimator(**{parameter: p})
            est.fit(X_train, y_train)

            train_errors.append(mean_squared_error(y_train, est.predict(X_train)))
            test_errors.append(mean_squared_error(y_test, est.predict(X_test)))

        all_train_errors.append(train_errors)
        all_test_errors.append(test_errors)

    return all_train_errors, all_test_errors
Example #50
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_raises_regex
from sklearn.utils.testing import assert_allclose
from sklearn.utils.testing import assert_warns_message
from sklearn.utils.testing import assert_no_warnings

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import TransformedTargetRegressor
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Lasso

from sklearn import datasets

friedman = datasets.make_friedman1(random_state=0)


def test_transform_target_regressor_error():
    X, y = friedman
    # provide a transformer and functions at the same time
    regr = TransformedTargetRegressor(regressor=LinearRegression(),
                                      transformer=StandardScaler(),
                                      func=np.exp, inverse_func=np.log)
    assert_raises_regex(ValueError, "'transformer' and functions"
                        " 'func'/'inverse_func' cannot both be set.",
                        regr.fit, X, y)
    # fit with sample_weight with a regressor which does not support it
    sample_weight = np.ones((y.shape[0],))
    regr = TransformedTargetRegressor(regressor=Lasso(),
                                      transformer=StandardScaler())
Example #51
#!/usr/bin/env python

# typical usage
import supylearner as sl
from sklearn import datasets, svm, linear_model, neighbors
import numpy as np

# generate dataset
np.random.seed(100)
X, y = datasets.make_friedman1(1000)

ols = linear_model.LinearRegression()
elnet = linear_model.ElasticNetCV(l1_ratio = .1)
ridge = linear_model.RidgeCV()
lars = linear_model.LarsCV()
lasso = linear_model.LassoCV()
nn = neighbors.KNeighborsRegressor()
svm1 = svm.SVR(kernel = 'rbf') 
svm2 = svm.SVR(kernel = 'poly')
lib = [ols, elnet, ridge,lars, lasso, nn, svm1, svm2]
libnames = ["OLS", "ElasticNet", "Ridge", "LARS", "LASSO", "kNN", "SVM rbf", "SVM poly"]

sl_inst = sl.SuperLearner(lib, libnames, loss = "L2")

sl_inst.fit(X, y)

sl_inst.summarize()

sl.cv_superlearner(sl_inst, X, y, K = 5)

Example #52
def rf_r_test(n=10):
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    ens = EnsembleRegressor([RandomForestRegressor(n_estimators=1, max_depth=None, min_samples_split=1, random_state=i) for i in range(n)]).fit(X_train, y_train)
    return RMSE(X_test, y_test, ens)
 def friedman1(n_samples=20000):
     """ Generated data """
     (data, target) = datasets.make_friedman1(n_samples=n_samples)
     return DatasetFactory.Dataset(data=data, target=target)
from sklearn.datasets import make_friedman1
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

from rfs import FFS


if __name__ == '__main__':
    n_features = 100
    n_samples = 5000
    X, y = make_friedman1(n_samples=n_samples, n_features=n_features, random_state=0)
    linear = LinearRegression()
    score = []
    n_lst = np.arange(1, 20, 1)

    for i in n_lst:
        # Record R^2 at each feature-ranking step of forward feature selection.
        selector = FFS(linear, i, step=1, verbose=0)
        selector.fit(X, y)
        score.append(selector.score(X, y))

    plt.plot(n_lst, score, label="score")
    #plt.plot(test_sizes, test_error, label="test")
    plt.legend()
    plt.xlabel('number of features selected')
    plt.ylabel('R^2')
    plt.show()
Example #55
def test_vs_linear_model(N=10,M=10,informative=5):
    
    #random.seed(1)
   
    def run(X,y):
        def fit_predict(name):
            lin.fit(X_train,y_train.ravel())
            y_pred = lin.predict(X_test)
            mae = mean_absolute_error(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            alpha = lin.alpha_ if 'alpha_' in dir(lin) else np.nan
            print "%s: mae=%f mse=%f alpha=%f" % (name,mae,mse,alpha)

        X_train = X[:N,:]
        y_train = y[:N]
        X_test = X[N:,:]
        y_test = y[N:]
        alphas = [10*(0.1**i) for i in range(10)]
        print "X_train:",X_train.shape,"X_test:",X_test.shape

        lin = linear_model.RidgeCV(alphas=alphas)
        fit_predict("ridge")
        
        lin = linear_model.LassoCV(alphas=alphas)
        fit_predict("lasso")
        
        lin_r = linear_model.RidgeCV(alphas=alphas).fit(X_train,y_train)
        lin_l = linear_model.LassoCV(alphas=alphas).fit(X_train,y_train)
        lin = LinearMAE(l1=lin_l.alpha_, l2=lin_r.alpha_, verbose=0, opt='CG', maxiter=300)
        fit_predict("LinearMAE")

        lin = RandomForestRegressor(n_estimators=100, 
            max_depth = 12,
            n_jobs = -1,
            verbose = 0, 
            random_state=3465343)
        fit_predict("RFRegressor")
        
        lin = GradientBoostingRegressor(n_estimators=100, 
            loss = 'lad',
            verbose = 0, 
            max_depth = 12,
            learning_rate = 0.1,
            subsample = 1.0,
            random_state=3465343)
        fit_predict("GBRegressor")
    
    #for noise in [0.01,0.1]:
    for noise in [0.1]:
        print "\nLinear Problem: noise=%.2f%%\n========" % (noise*100,)
        a = np.random.sample(M)
        N2 = N*2
        X = np.reshape(np.random.sample(N2*M),(N2,M))
        y = np.dot(X,a) + np.random.sample(N2)*noise
        run(X,y)

        print "\nRegression Problem: noise=%.2f%%\n========" % (noise*100,)
        X,y = make_regression(n_samples=N*2, n_features=M, n_informative=informative,
            n_targets=1, bias=0.0, effective_rank=None, tail_strength=0.5, 
            noise=noise, shuffle=True, coef=False, random_state=None)
        run(X,y)

        print "\nRegression Problem, effective_rank=5 noise=%.2f%%\n========" % (noise*100,)
        X,y = make_regression(n_samples=N*2, n_features=M, n_informative=informative,
            n_targets=1, bias=0.0, effective_rank=5, tail_strength=0.5, 
            noise=noise, shuffle=True, coef=False, random_state=None)
        run(X,y)

        print "\nFriedman1 Problem noise=%.2f%%\n========" % (noise*100,)
        X,y = make_friedman1(n_samples=N*2, n_features=M, noise=noise, random_state=None)
        run(X,y)

def with_best_first(cls, max_leaf_nodes):
    return partial(cls, max_leaf_nodes=max_leaf_nodes)


def uniform_dataset(args):
    X = np.random.random(size=(args.num_examples, args.num_features))
    y = np.random.choice([-1, 1], size=args.num_examples)
    return (X, y)

DATASETS = {
    "uniform": uniform_dataset,
    "hastie": lambda args: datasets.make_hastie_10_2(
        n_samples=args.num_examples),
    "friedman1": lambda args: datasets.make_friedman1(
        n_samples=args.num_examples, n_features=args.num_features),
    "friedman2": lambda args: datasets.make_friedman2(
        n_samples=args.num_examples, noise=args.noise),
    "friedman3": lambda args: datasets.make_friedman3(
        n_samples=args.num_examples, noise=args.noise),
    "make_regression": lambda args: datasets.make_regression(
        n_samples=args.num_examples,
        n_features=args.num_features,
        n_informative=args.num_informative)
}

ENSEMBLE_REGRESSORS = [
    ("GB-D1", with_depth(ensemble.GradientBoostingRegressor, 1)),
    ("GB-D3", with_depth(ensemble.GradientBoostingRegressor, 3)),
    ("GB-B10", with_best_first(ensemble.GradientBoostingRegressor, 10)),
    ("RF-D1", with_depth(ensemble.RandomForestRegressor, 1)),
Example #57
    for t in self.trees:
      np.random.sample()
      t.fit(X, Y)
    
  def predict(self, X):
    Y = []
    trees = self.trees
    for x in X:
      Y.append(median([t.predict_one(x) for t in trees]))
    return Y"""
      
if __name__ == '__main__':
  from sklearn.metrics import r2_score
  from sklearn.datasets import make_friedman1
  from sklearn.tree import DecisionTreeRegressor
  
  X, Y = make_friedman1(10000, 100)
  X_train, Y_train = X[:9000], Y[:9000]
  X_test, Y_test = X[9000:], Y[9000:]
  
  clf = Forest(50, 10, .7, 1)#Regressor(10)
  clf2 = RandomForestRegressor(50, max_depth=10)
  
  clf.fit(X_train, Y_train)
  clf2.fit(X_train, Y_train)
  
  pred = clf.predict(X_test)
  pred2 = clf2.predict(X_test)
  
  print(r2_score(Y_test, pred))
  print(r2_score(Y_test, pred2))
Example #58
if __name__ == "__main__":

    pdata = np.recfromcsv('/mindhive/gablab/sad/PY_STUDY_DIR/Block/volsurf/l2output/social/split_halves/regression/lsasDELTA/6mm/allsubs.csv',names=True)
    
    subject_num = len(pdata.subject)
    # initialize dependent variable
    y = pdata.lsas_pre-pdata.lsas_post
    ind_variables_num = 4 #if change number here, also modify assignments below (and vice versa)
    # initialize design matrix
    X = np.zeros([subject_num,ind_variables_num])
    X[:,0] = pdata.lsas_pre
    X[:,1] = pdata.classtype-2
    X[:,2] = pdata.age
    X[:,3] = pdata.sex- 1
    print "running FS"
    
    from sklearn.datasets import make_friedman1

    X1, y1 = make_friedman1(n_samples=50, n_features=10, random_state=0)
    estimator = SVR(kernel="linear")
    selector1 = RFE(estimator, 3, step=1)
    selector1 = selector1.fit(X, y)
    
    
    selector, Reg = do_FS(X,y)
    
    
    
    
    
Example #59
# 1.11.4.3. Fitting additional weak-learners



from sklearn.metrics import mean_squared_error
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor

X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)

X_train, X_test = X[:200], X[200:]
y_train, y_test = y[:200], y[200:]

est = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,
    max_depth=1, random_state=0, loss='ls').fit(X_train, y_train)

_ = est.set_params(n_estimators=200, warm_start=True)  # set warm_start and a new number of trees
_ = est.fit(X_train, y_train)  # fit 100 additional trees to est
print(mean_squared_error(y_test, est.predict(X_test)))
# 3.84...
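
Because est above was grown in two stages with warm_start, one way to check where the additional trees stop helping is to walk the boosting stages with staged_predict, a standard GradientBoostingRegressor method; a short sketch reusing the names defined above:

import numpy as np

test_mse = [mean_squared_error(y_test, y_pred) for y_pred in est.staged_predict(X_test)]
best_stage = int(np.argmin(test_mse)) + 1
print("best number of trees:", best_stage, "with test MSE:", test_mse[best_stage - 1])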