Example #1
def test_regressor():
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]

    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [5. / 4] * len(X))
Example #2
def mean_model(features, solutions, verbose=0):
    columns = solutions.columns
    clf = DummyRegressor()
    print('Training Model... ')
    clf.fit(features, solutions)
    print('Done Training')
    return (clf, columns)
Example #3
def test_y_mean_attribute_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]
    # when strategy = 'mean'
    est = DummyRegressor(strategy='mean')
    est.fit(X, y)
    assert_equal(est.y_mean_, np.mean(y))
Example #4
def train_classifier():
	X_train = tfv.transform(video_captions_train)
	X_test  = tfv.transform(video_captions_test)
	
	dummy = DummyRegressor(strategy="median")
	dummy.fit(X_train, Y_train)
	Y_pred_med = dummy.predict(X_test)
Example #5
def test_dummy_regressor_on_nan_value():
    X = [[np.NaN]]
    y = [1]
    y_expected = [1]
    clf = DummyRegressor()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
Example #6
def test_dummy_regressor_on_3D_array():
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = np.array([2, 2, 2])
    y_expected = np.array([2, 2, 2])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred = cls.predict(X)
    assert_array_equal(y_pred, y_expected)
Example #7
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = DummyRegressor()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
Example #8
def test_scorer_sample_weight():
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0], random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy="median")
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    estimator = dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS]
        + [(name, sensible_clf) for name in CLF_SCORERS]
        + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )

    for name, scorer in SCORERS.items():
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(estimator[name], X_test, target, sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert_not_equal(
                weighted,
                unweighted,
                msg="scorer {0} behaves identically when "
                "called with sample weights: {1} vs "
                "{2}".format(name, weighted, unweighted),
            )
            assert_almost_equal(
                weighted,
                ignored,
                err_msg="scorer {0} behaves differently when "
                "ignoring samples and setting sample_weight to"
                " 0: {1} vs {2}".format(name, weighted, ignored),
            )

        except TypeError as e:
            assert_true(
                "sample_weight" in str(e),
                "scorer {0} raises unhelpful exception when called " "with sample weights: {1}".format(name, str(e)),
            )
Example #9
def test_median_strategy_regressor():

    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
Example #10
def test_dummy_regressor_return_std():
    X = [[0]] * 3  # ignored
    y = np.array([2, 2, 2])
    y_std_expected = np.array([0, 0, 0])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred_list = cls.predict(X, return_std=True)
    # there should be two elements when return_std is True
    assert_equal(len(y_pred_list), 2)
    # the second element should be all zeros
    assert_array_equal(y_pred_list[1], y_std_expected)
Example #11
def simplest(cube, y, cv):
    """ just use the mean to impute the missing values
    """
    from sklearn.dummy import DummyRegressor
    clf = DummyRegressor()
    X = cube.reshape(cube.shape[0], cube.shape[1] * cube.shape[2])
    sse = np.zeros(y.shape[1])
    for train, test in cv:
        y_train, y_test = y[train], y[test]
        y_predict = clf.fit(X[train], y[train]).predict(X[test])
        sse += np.mean((y_predict - y_test) ** 2, 0)
    return sse
Example #12
def _make_estimators(X_train, y_train, y_ml_train):
    # Make estimators that make sense to test various scoring methods
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    return dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS] +
        [(name, sensible_clf) for name in CLF_SCORERS] +
        [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )
Example #13
def test_multioutput_regressor():

    X_learn = np.random.randn(10, 10)
    y_learn = np.random.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = np.random.randn(20, 10)
    y_test = np.random.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    assert_array_equal(np.tile(mean, (y_learn.shape[0], 1)), y_pred_learn)
    assert_array_equal(np.tile(mean, (y_test.shape[0], 1)), y_pred_test)
    _check_behavior_2d(est)
Example #14
def test_mean_strategy_multioutput_regressor():

    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)
Example #15
def test_regressor_prediction_independent_of_X(strategy):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg1.fit(X1, y)
    predictions1 = reg1.predict(X1)

    X2 = [[1]] * 4
    reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg2.fit(X2, y)
    predictions2 = reg2.predict(X2)

    assert_array_equal(predictions1, predictions2)
Example #16
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier()
    sensible_clf.fit(X_train, y_train)
    estimator = dict([(name, sensible_regr)
                      for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf)
                      for name in CLF_SCORERS])

    for name, scorer in SCORERS.items():
        try:
            weighted = scorer(estimator[name], X_test, y_test,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], y_test[10:])
            unweighted = scorer(estimator[name], X_test, y_test)
            assert_not_equal(weighted, unweighted,
                             "scorer {0} behaves identically when called with "
                             "sample weights: {1} vs {2}".format(name,
                                                                 weighted,
                                                                 unweighted))
            assert_equal(weighted, ignored,
                         "scorer {0} behaves differently when ignoring "
                         "samples and setting sample_weight to 0: "
                         "{1} vs {2}".format(name, weighted, ignored))

        except TypeError as e:
            assert_true("sample_weight" in str(e),
                        "scorer {0} raises unhelpful exception when called "
                        "with sample weights: {1}".format(name, str(e)))
Example #17
def _minimize_simbo_general(fun,
                            x0,  # only used to get number of features
                            args=(),
                            callback=None,
                            batch_size=100,
                            population_size=10000,
                            maxiter=10000,
                            scorer=None,  # if no scorer is given, scores are constant
                            selector=None,  # only relevant if a sampler is given
                            sampler=None):
    n_iter = int(maxiter / batch_size)
    assert n_iter > 0

    dummy_generator = generative_models.DummyGenerator(len(x0))

    if scorer is None:
        scorer = DummyRegressor()
    if sampler is None:
        sampler = dummy_generator

    if isinstance(selector, float) and 0 < selector < 1:
        selector = percentile_selector(selector)

    for i in range(n_iter):
        if i == 0:
            batch = dummy_generator.sample(batch_size)
        else:
            population = sampler.sample(population_size)
            scores = scorer.predict(population)
            batch_w_score = heapq.nsmallest(batch_size, zip(scores, population),
                                            key=lambda x: x[0])
            batch = [v for score, v in batch_w_score]
        results = optimize_utils.score_multi(fun, batch, args, callback)
        selected = selector(results, batch) if selector is not None else batch
        scorer.fit(batch, results)
        sampler.fit(selected)

    best_fval, best_x = max(zip(results, batch), key=lambda x: x[0])
    nfev = batch_size * n_iter
    return optimize_utils.to_result(x=best_x, fun=best_fval,
                                    niter=n_iter, nfev=nfev)
Example #18
def test_constant_strategy_multioutput_regressor():

    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    # test with 2d array
    constants = random_state.randn(5)

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(constants, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d_for_constant(est)
Example #19
def test_weights_regressor():
    """Check weighted average regression prediction on boston dataset."""
    reg1 = DummyRegressor(strategy='mean')
    reg2 = DummyRegressor(strategy='median')
    reg3 = DummyRegressor(strategy='quantile', quantile=.2)
    ereg = VotingRegressor([('mean', reg1), ('median', reg2),
                            ('quantile', reg3)], weights=[1, 2, 10])

    X_r_train, X_r_test, y_r_train, y_r_test = \
        train_test_split(X_r, y_r, test_size=.25)

    reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
    reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
    reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    avg = np.average(np.asarray([reg1_pred, reg2_pred, reg3_pred]), axis=0,
                     weights=[1, 2, 10])
    assert_almost_equal(ereg_pred, avg, decimal=2)

    ereg_weights_none = VotingRegressor([('mean', reg1), ('median', reg2),
                                         ('quantile', reg3)], weights=None)
    ereg_weights_equal = VotingRegressor([('mean', reg1), ('median', reg2),
                                          ('quantile', reg3)],
                                         weights=[1, 1, 1])
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    ereg_none_pred = ereg_weights_none.predict(X_r_test)
    ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
    assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
Example #20
def test_constant_strategy_regressor():

    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="constant", constant=[43])
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    reg = DummyRegressor(strategy="constant", constant=43)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))
Example #21
    def test_stacked_featurizer(self):
        data = self.make_test_data()
        data['y'] = [1, 2, 3]

        # Test for a regressor
        model = DummyRegressor()
        model.fit(self.multi.featurize_many(data['x']), data['y'])

        #  Test the predictions
        f = StackedFeaturizer(self.single, model)
        self.assertEqual([2], f.featurize(data['x'][0]))

        #  Test the feature names
        self.assertEqual(['prediction'], f.feature_labels())
        f.name = 'ML'
        self.assertEqual(['ML prediction'], f.feature_labels())

        # Test classifier
        model = DummyClassifier(strategy="prior")
        data['y'] = [0, 0, 1]
        model.fit(self.multi.featurize_many(data['x']), data['y'])

        #  Test the prediction
        f.model = model
        self.assertEqual([2. / 3], f.featurize(data['x'][0]))

        #  Test the feature labels
        self.assertRaises(ValueError, f.feature_labels)
        f.class_names = ['A', 'B']
        self.assertEqual(['ML P(A)'], f.feature_labels())

        # Test with three classes
        data['y'] = [0, 2, 1]
        model.fit(self.multi.featurize_many(data['x']), data['y'])

        self.assertArrayAlmostEqual([1. / 3] * 2, f.featurize(data['x'][0]))
        f.class_names = ['A', 'B', 'C']
        self.assertEqual(['ML P(A)', 'ML P(B)'], f.feature_labels())
Example #22
def test_constant_strategy_multioutput_regressor():

    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    # test with 2d array
    constants = random_state.randn(5)

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        constants, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d_for_constant(est)
Example #23
def test_quantile_invalid():

    X = [[0]] * 5  # ignored
    y = [0] * 5  # ignored

    est = DummyRegressor(strategy="quantile")
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=None)
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=[0])
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=-0.1)
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=1.1)
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile='abc')
    assert_raises(TypeError, est.fit, X, y)
Example #24
yelp['hate'] = yelp.text.str.contains('hate', case=False).astype(int)

# add new features to the model
feature_cols = ['cool', 'useful', 'funny', 'length', 'love', 'hate']
X = yelp[feature_cols]
train_test_rmse(X, y)


# TASK 8 (BONUS): compare your best RMSE with RMSE for the null model

# split the data (outside of the function)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# use scikit-learn's built-in dummy regressor
from sklearn.dummy import DummyRegressor
dumb = DummyRegressor(strategy='mean')
dumb.fit(X_train, y_train)
y_dumb = dumb.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_dumb)))

# or, create a NumPy array with the right length, and fill it with the mean of y_train
y_null = np.zeros_like(y_test, dtype=float)
y_null.fill(y_train.mean())
print(np.sqrt(metrics.mean_squared_error(y_test, y_null)))


# TASK 9 (BONUS): treat this as a classification problem, try KNN, maximize your accuracy

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=150)
knn.fit(X_train, y_train)
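
# The snippet stops before scoring; a plausible (assumed) continuation,
# reusing the train/test split and assuming `from sklearn import metrics`
# as used earlier in this script, would measure test accuracy:
y_pred_class = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred_class))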
Example #25
# data including the Vegas betting lines
test_season = pd.read_csv('data/test_season.csv')
train_season = pd.read_csv('data/train_season.csv')
X_train_s = train_season.drop('GAME_TOTAL', axis = 1).to_numpy()
y_train_s = train_season['GAME_TOTAL'].to_numpy()
X_test_s = test_season.drop('GAME_TOTAL', axis = 1).to_numpy()
y_test_s = test_season['GAME_TOTAL'].to_numpy()
Test_Vegas = test_season['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_season['TOTAL_CLOSE'].to_numpy()

#Vegas BASELINE = 17.650007402704748 
mean_squared_error(np.append(y_train_s,y_test_s), np.append(Train_Vegas,Test_Vegas), squared = False)

#DUMMY REGRESSOR:

dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_s, y_train_s)
#-0.7833193001644205
dummy_regr.score(X_test_s, y_test_s)
#27.845427872989156
mean_squared_error(y_test_s, dummy_regr.predict(X_test_s), squared = False)

#OLS
regressor = sm.OLS(y_train_s, X_train_s)
regressor = regressor.fit()
#evidently this returned a 0.991 R**2
#second run gave us 0.993
regressor.summary()
preds = regressor.predict(X_test_s)
#18.5802074596655
mean_squared_error(y_test_s, preds, squared = False)
Example #26
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import joblib
import sys
from pathlib import Path
sys.path.append('/home/jiajunb/prosocial-conversations')
from models import XGBOOST_FEATURES, EIGENMETRICS

ROOT_DIR = Path('/shared/0/projects/prosocial/data/finalized/')
train_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/train.tsv',
                       sep='\t',
                       usecols=XGBOOST_FEATURES + EIGENMETRICS)

train_X = train_df[XGBOOST_FEATURES].values
train_y = train_df[EIGENMETRICS].values.reshape(-1)
dummy_clf = DummyRegressor(strategy="mean")
dummy_clf.fit(train_X, train_y)

# on training set
train_preds = dummy_clf.predict(train_X)
print(f'R^2 on training set: {r2_score(train_y, train_preds)}')
print(f'MSELoss on training set: {mean_squared_error(train_preds, train_y)}')

output_path = ROOT_DIR / 'model_checkpoints/dummy'
output_path.mkdir(exist_ok=True, parents=True)
joblib.dump(dummy_clf, output_path / 'dummy.model.buffer')

test_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/test.tsv',
                      sep='\t',
                      usecols=XGBOOST_FEATURES + EIGENMETRICS)
test_X = test_df[XGBOOST_FEATURES].values
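
# The excerpt cuts off here; mirroring the training-set evaluation above, a
# plausible (assumed) continuation would score the dummy model on the test set:
test_y = test_df[EIGENMETRICS].values.reshape(-1)
test_preds = dummy_clf.predict(test_X)
print(f'R^2 on test set: {r2_score(test_y, test_preds)}')
print(f'MSELoss on test set: {mean_squared_error(test_preds, test_y)}')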
Example #27
def DummyPrediction(X_train, y_train, X_test, y_test):
    dummy = DummyRegressor()
    dummy = dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_test)
    return y_pred
Example #28
def test_regressor_exceptions():
    reg = DummyRegressor()
    assert_raises(ValueError, reg.predict, [])
Example #29
def test_set_params_nested_pipeline():
    estimator = Pipeline([('a', Pipeline([('b', DummyRegressor())]))])
    estimator.set_params(a__b__alpha=0.001, a__b=Lasso())
    estimator.set_params(a__steps=[('b', LogisticRegression())], a__b__C=5)
Example #30
def main():

    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # The result turns out to be worse using non-linear polynomial regression
    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train)
    # X_test = poly.fit_transform(X_test)
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training regressors
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression: (baseline)')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # KNN Regression
    # print('\nKNN Regression: ')
    # model = KNeighborsRegressor()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # Neural Network - Bernoulli Restricted Boltzmann Machine (RBM)
    # print('\nNeural Network - RBM: ')
    # model = BernoulliRBM()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
Example #31
def test_unknown_strategy_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy='gona')
    assert_raises(ValueError, est.fit, X, y)
Example #32
def main():

    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # The result turns out to be worse using non-linear polynomial regression
    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train)
    # X_test = poly.fit_transform(X_test)
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training regressors
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression: (baseline)')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # KNN Regression
    # print('\nKNN Regression: ')
    # model = KNeighborsRegressor()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # Neural Network - Bernoulli Restricted Boltzmann Machine (RBM)
    # print('\nNeural Network - RBM: ')
    # model = BernoulliRBM()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
Example #33
    reg_drop = StackingRegressor(estimators=estimators,
                                 final_estimator=rf,
                                 cv=5)

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)
    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))


@pytest.mark.parametrize(
    "cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)])
@pytest.mark.parametrize("final_estimator, predict_params",
                         [(None, {}),
                          (RandomForestRegressor(random_state=42), {}),
                          (DummyRegressor(), {
                              'return_std': True
                          })])
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=final_estimator,
                            cv=cv,
                            passthrough=passthrough)
Example #34
from sklearn.metrics import mean_squared_error

# In[35]:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

# # Baseline Model

# In[36]:

from sklearn.dummy import DummyRegressor

dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
dummy_regr.predict(X_train)
baseline = dummy_regr.score(X_train, y_train)
print("Baseline R^2: %f" % baseline)

# # Multiple Linear Regression

# In[37]:

ols = linear_model.LinearRegression()
ols.fit(X_train, y_train)
print("Coefficients: %s" % ols.coef_)
print("Intercept: %f" % ols.intercept_)
y_test_prediction = ols.predict(X_test)
ols.score(X_train, y_train)
Example #35
def test_regressor_score_with_None(y, y_test):
    reg = DummyRegressor()
    reg.fit(None, y)
    assert reg.score(None, y_test) == 1.0
Example #36
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor

diabetes = datasets.load_diabetes()

X = diabetes.data[:, None, 6]
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lm = LinearRegression().fit(X_train, y_train)
lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train, y_train)

y_predict = lm.predict(X_test)
y_predict_dummy_mean = lm_dummy_mean.predict(X_test)

print('Linear model, coefficients: ', lm.coef_)
print("Mean squared error (dummy): {:.2f}".format(mean_squared_error(y_test, 
                                                                     y_predict_dummy_mean)))
print("Mean squared error (linear model): {:.2f}".format(mean_squared_error(y_test, y_predict)))
print("r2_score (dummy): {:.2f}".format(r2_score(y_test, y_predict_dummy_mean)))
print("r2_score (linear model): {:.2f}".format(r2_score(y_test, y_predict)))

# Plot outputs
plt.scatter(X_test, y_test,  color='black')
plt.plot(X_test, y_predict, color='green', linewidth=2)
plt.plot(X_test, y_predict_dummy_mean, color='red', linestyle='dashed',
         linewidth=2)  # closing arguments assumed; the source was cut mid-call
Example #37
import csv
import pickle

from sklearn.dummy import DummyRegressor
import numpy as np

age_range = 80
gender = {'male': 0, 'other': 0.5, 'female': 1}

X = np.array([[20 / age_range, gender['male']], [56 / age_range, gender['other']]])
Y = np.array([[.2], [.7]])


clf = DummyRegressor()
clf.fit(X, Y)

# print([r[2] for r in data])
print(Y)
# print([
#     movies[int(round(idx * len(movies)))]
#     for idx in clf.predict(X)
# ])
print(clf.predict([[0.2, 1]]))

with open('model.pk', 'wb') as outfile:
    pickle.dump(clf, outfile)
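
A quick round-trip check (not part of the original snippet) would reload the pickled model and confirm it still predicts the fitted mean of Y, here 0.45:

# hypothetical follow-up: reload the pickled DummyRegressor and reuse it
with open('model.pk', 'rb') as infile:
    clf2 = pickle.load(infile)

print(clf2.predict([[0.2, 1]]))  # [[0.45]], the mean of Y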
Example #38
try:
    # modern import location (try-clause restored; the excerpt began mid-try)
    from sklearn.metrics import check_scoring
except ImportError:
    # for scikit-learn 0.18 and 0.19
    from sklearn.metrics.scorer import check_scoring

# Regression
ridge = RidgeCV()
svr = SVR(kernel='linear')
# Classification
svc = LinearSVC()
logistic_l1 = LogisticRegression(penalty='l1')
logistic_l2 = LogisticRegression(penalty='l2')
ridge_classifier = RidgeClassifierCV()
random_forest = RandomForestClassifier()

dummy_classifier = DummyClassifier(random_state=0)
dummy_regressor = DummyRegressor()

regressors = {'ridge': (ridge, []),
              'svr': (svr, 'C')}
classifiers = {'svc': (svc, 'C'),
               'logistic_l1': (logistic_l1, 'C'),
               'logistic_l2': (logistic_l2, 'C'),
               'ridge_classifier': (ridge_classifier, [])}
# Create a test dataset
rng = np.random.RandomState(0)
X = rng.rand(100, 10)
# Create different targets
y_regression = rng.rand(100)
y_classification = np.hstack([[-1] * 50, [1] * 50])
y_classification_str = np.hstack([['face'] * 50, ['house'] * 50])
y_multiclass = np.hstack([[0] * 35, [1] * 30, [2] * 35])
Example #39
# Load the libraries.
from sklearn.datasets import load_boston
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Load the data (note: load_boston was removed in scikit-learn 1.2).
boston = load_boston()

# Create the features and the target.
features, target = boston.data, boston.target

# Split into training and test sets.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, random_state=0)

# Create the dummy regressor.
dummy = DummyRegressor(strategy='mean')

# "Train" the dummy regressor.
dummy.fit(features_train, target_train)

# Get the R-squared score.
dummy.score(features_test, target_test)
Example #40
def test_quantile_strategy_regressor():

    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="quantile", quantile=0.5)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.min(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=1)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.max(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0.3)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))
Example #41
    def base_dummy(self):
        model = DummyRegressor(strategy='mean')
        setattr(model, 'data_schema', self.data._X.columns.values)
        setattr(model, 'model_path', 'model_path')
        return model.fit(self.data._X, self.data._y)
Example #42
def test_quantile_strategy_empty_train():
    est = DummyRegressor(strategy="quantile", quantile=0.4)
    assert_raises(ValueError, est.fit, [], [])
Example #43
    def __init__(self):
        self.clf = DummyRegressor()
Example #44
def test_regressor_score_with_None(y, y_test):
    reg = DummyRegressor()
    reg.fit(None, y)
    assert_equal(reg.score(None, y_test), 1.0)
Example #45
# Framed as a binary classification task, the problem would
# be significantly imbalanced, and even a simplistic model that would only
# predict the mean can achieve an accuracy of 93%.
#
# To evaluate the pertinence of the metrics used, we will consider as a
# baseline a "dummy" estimator that constantly predicts the mean frequency of
# the training sample.

from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.33, random_state=0)

dummy = Pipeline([
    ("preprocessor", linear_model_preprocessor),
    ("regressor", DummyRegressor(strategy="mean")),
]).fit(df_train,
       df_train["Frequency"],
       regressor__sample_weight=df_train["Exposure"])

##############################################################################
# Let's compute the performance of this constant prediction baseline with 3
# different regression metrics:

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_poisson_deviance


def score_estimator(estimator, df_test):
    """Score an estimator on the test set."""
Example #46
    def test_init(self):
        regressor = DummyRegressor()
        regressor.fit(numpy.array([[0], [0]]), numpy.array([0.0, 2.0]))
        self.assertEqual(1.0, regressor.constant_)
        regressor_proxy = EstimatorProxy(regressor, attr_names_=["constant_"])
        self.assertEqual(1.0, regressor_proxy.constant_)
Example #47
def test_quantile_strategy_multioutput_regressor():

    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        median, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(
        quantile_values, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)
Example #48
def main():

    # read review data
    print('parsing review data...')
    reviews = parse_json('./yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

    # use only reviews posted after 2008
    valid_reviews = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        if review_date.year < 2008: 
            continue
        valid_reviews.append(review)
    reviews = valid_reviews

    # sample the data
    # sample_num = len(reviews)
    # print('sampling...', sample_num, 'out of', len(reviews))
    # reviews = sample(reviews, sample_num)

    # tokenize text for all reviews
    print('tokenizing text for all reviews...')
    texts = [review['text'] for review in reviews]
    count_vect = CountVectorizer(max_features = 100)
    X = count_vect.fit_transform(texts)

    # transform from occurrence to frequency
    print('converting occurrence to frequency...')
    tfidf_transformer = TfidfTransformer()
    X = tfidf_transformer.fit_transform(X)

    # load the linear model for normalization
    clf = joblib.load('./normalization/linear_model_for_normalization.pkl')

    # get labels
    print('calculating labels...')
    y = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        # normalize
        normalizor = clf.predict(np.array([[review_date.year]]))[0][0]
        review_quality = sum(review['votes'].values()) / normalizor
        y.append(review_quality)

    # splitting into train and test set
    print('splitting into train and test set...')
    train_len = int(X.shape[0] * 0.6)
    X_train = X[:train_len, :]
    y_train = y[:train_len]
    X_test = X[train_len:, :]
    y_test = y[train_len:]
    print('train size:', X_train.shape)
    print('test size:', X_test.shape)

    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train.toarray())
    # X_test = poly.fit_transform(X_test.toarray())
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training regressors
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression:')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Ridge
    print('\nRidge: ')
    model = Ridge()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Passive Aggressive
    print('\nPassive Aggressive: ')
    model = PassiveAggressiveRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
Example #49
print("Percentage of zero claims = {0:%}".format(
    df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / df["Exposure"].sum()))

##############################################################################
# It is worth noting that 92 % of policyholders have zero claims, and if we
# were to convert this problem into a binary classification task, it would be
# significantly imbalanced.
#
# To evaluate the pertinence of the metrics used, we will consider as a
# baseline a "dummy" estimator that constantly predicts the mean frequency of
# the training sample.

df_train, df_test = train_test_split(df, random_state=0)

dummy = make_pipeline(linear_model_preprocessor,
                      DummyRegressor(strategy='mean'))
dummy.fit(df_train,
          df_train["Frequency"],
          dummyregressor__sample_weight=df_train["Exposure"])


def score_estimator(estimator, df_test):
    """Score an estimator on the test set."""

    y_pred = estimator.predict(df_test)

    print(
        "MSE: %.3f" %
        mean_squared_error(df_test["Frequency"], y_pred,
                           sample_weight=df_test["Exposure"]))
    print(
        "MAE: %.3f" %
        mean_absolute_error(df_test["Frequency"], y_pred,
                            sample_weight=df_test["Exposure"]))
    # (the excerpt was cut mid-statement; the MAE call above is completed by
    # analogy with the MSE call)
Example #50
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor

nobs = X_meg.shape[0]
max_comps = range(2,30,2)
nfolds=50
cv = ShuffleSplit(nobs, n_iter=nfolds, test_size=.1)  # pre-0.18 sklearn.cross_validation API


# Trying the prediction with different components
comp_scores = []
dumb_scores = []
for ncomp in max_comps:
    print('Trying %d components' % ncomp)
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')

    mae = 0
    dumb_mae = 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        
        pls.fit(X_fmri_train, X_meg_train)
        pred = pls.predict(X_fmri_test)

        mae += mean_absolute_error(X_meg_test, pred)

        dumb.fit(X_fmri_train, X_meg_train)
Example #51
def test_constants_not_specified_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy='constant')
    assert_raises(TypeError, est.fit, X, y)
Example #52
# prepare configuration for cross validation test harness
num_folds = 10
seed = 7
# prepare models
models = []
models.append(('LR', LinearRegression()))
models.append(('Ridge', Ridge()))
#models.append(('ARDRegression', linear_model.ARDRegression()))
models.append(('Lasso', linear_model.Lasso()))
models.append(('LassoCV', linear_model.LassoCV()))
models.append(('LassoLars', linear_model.LassoLars()))
# Decision tree
models.append(('Dec tree', tree.DecisionTreeRegressor()))

# sanity check
models.append(('Dummy', DummyRegressor(strategy="median")))


def keras_baseline_model():
    # create model
    model = Sequential()
    model.add(
        Dense(128, input_dim=numFeatures, init='normal', activation='relu'))
    model.add(Dense(1, init='normal', activation="relu"))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


models.append(('Keras',
               KerasRegressor(build_fn=keras_baseline_model,
Example #53
	pipeline.configure(**pmml_options)
	if isinstance(regressor, XGBRegressor):
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params, precision = 1e-5, zeroThreshold = 1e-5)
	else:
		pipeline.verify(auto_X.sample(frac = 0.05, random_state = 13), predict_params = predict_params)
	store_pkl(pipeline, name)
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)

if "Auto" in datasets:
	build_auto(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), random_state = 13, n_estimators = 17), "AdaBoostAuto")
	build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
	build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
	build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False)
	build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto")
	build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
	build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto")
	build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto")
	build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto")
	build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto")
	build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto")
	build_auto(HistGradientBoostingRegressor(max_iter = 31, random_state = 13), "HistGradientBoostingAuto")
	build_auto(HuberRegressor(), "HuberAuto")
	build_auto(LarsCV(cv = 3), "LarsAuto")
	build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto")
	build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto")
	build_auto(LinearRegression(), "LinearRegressionAuto")
	build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto")
	build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto")
	build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True)
	build_auto(RidgeCV(), "RidgeAuto")
Example #54
import pandas as pd
from sklearn.dummy import DummyRegressor

# Loading in the data
canucks = pd.read_csv('data/canucks_subbed.csv')

# Define X and y
X = canucks.loc[:, ['No.', 'Age', 'Height', 'Weight', 'Experience']]
y = canucks['Salary']

# Create a model
model = DummyRegressor(strategy="mean")

# Fit your data
model.fit(X, y)

# Predict on X
model.predict(X)

# The model's R^2 score (score() returns R^2 for a regressor, not accuracy)
score = round(model.score(X, y), 2)

score
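
Since the mean strategy predicts the training mean everywhere, its in-sample R^2 is exactly 0 by construction (the residual sum of squares equals the total sum of squares); a minimal self-contained check with made-up numbers:

import numpy as np
from sklearn.dummy import DummyRegressor

X_toy = np.arange(6).reshape(-1, 1)               # features are ignored
y_toy = np.array([3.0, 1.0, 4.0, 1.0, 5.0, 9.0])
dummy = DummyRegressor(strategy="mean").fit(X_toy, y_toy)
print(dummy.score(X_toy, y_toy))                  # 0.0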
Example #55
df = test_regressor(ExtraTreesRegressor(n_estimators=1000), df)
df = test_regressor(GradientBoostingRegressor(n_estimators=1000), df)
df = test_regressor(RandomForestRegressor(n_estimators=1000), df)
df = test_regressor(GaussianProcessRegressor(), df)
# df = test_regressor(IsotonicRegression(), df) - has errors
df = test_regressor(LinearSVR(), df)
df = test_regressor(NuSVR(), df)
df = test_regressor(SVR(), df)
df = test_regressor(XGBRegressor(n_estimators=1000), df)

df = test_regressor(lgb.LGBMRegressor(n_estimators=1000), df)
df = test_regressor(CatBoostRegressor(n_estimators=1000), df)
df = test_regressor(DecisionTreeRegressor(max_depth=3), df)
df = test_regressor(KNeighborsRegressor(), df)
# df = test_regressor(RadiusNeighborsRegressor(), df) - also has errors
df = test_regressor(DummyRegressor(), df)

df = test_regressor(
    StackingRegressor(regressors=[
        GradientBoostingRegressor(n_estimators=1000),
        HuberRegressor(),
        RidgeCV(cv=5),
        BayesianRidge(compute_score=True, copy_X=True)
    ],
                      meta_regressor=LassoCV(cv=5)), df)

df = test_regressor(
    StackingRegressor(regressors=[
        ElasticNetCV(),
        HuberRegressor(),
        RidgeCV(cv=5),
Example #56
from sklearn.dummy import DummyRegressor

nobs = X_meg.shape[0]
max_comps = range(5,30,5)
nfolds=50
cv = ShuffleSplit(nobs, n_iter=nfolds, test_size=.1)  # pre-0.18 sklearn.cross_validation API
y = inatt

# Trying the prediction with different components
comp_scores = []
dumb_scores = []
meg_scores, fmri_scores = [], []
for ncomp in max_comps:
    print('Trying %d components' % ncomp)
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')

    mae = 0
    dumb_mae = 0
    meg_mae, fmri_mae = 0, 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        y_train = y[train]
        y_test = y[test]

        X_train = np.hstack([X_fmri_train,X_meg_train])
        X_test = np.hstack([X_fmri_test,X_meg_test])
        
Example #57
    def fit(self, X, y):
        self.reg = DummyRegressor()
        return self.reg.fit(X, y)
Example #58
CLASSIFIERS = {
    # ... (earlier entries omitted in this excerpt)
    # 'gb': GradientBoostingClassifier(),
    'xgb': XGBClassifier(),
    'dummy': DummyClassifier()
}

REGRESSORS = {
    'lr': LinearRegression(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'mlp': MLPRegressor(),
    'SVC': svm.SVR(),
    'knn': KNeighborsRegressor(),
    'rf': RandomForestRegressor(),
    'gb': GradientBoostingRegressor(),
    'xgb': XGBRegressor(),
    'dummy': DummyRegressor()
}


def spotcheck(estimators=CLASSIFIERS,
              X=None,
              y=None,
              score='roc_auc',
              cv=3,
              sort_by='mean'):
    logging.info("Evaluation metrics: " + str(score))
    results = {}

    @timeit(get_time=True)
    def _eval(clf):
        scores = cross_val_score(estimator=clf, X=X, y=y, scoring=score, cv=cv)
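        # (excerpt ends mid-function; a plausible continuation, assuming the
        # @timeit(get_time=True) decorator makes _eval return (value, seconds):)
        return scores

    for name, clf in estimators.items():
        scores, elapsed = _eval(clf)
        results[name] = {'mean': scores.mean(), 'std': scores.std(),
                         'time': elapsed}
    return sorted(results.items(), key=lambda kv: kv[1][sort_by], reverse=True)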