def test_regressor():
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]

    reg = DummyRegressor()
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [5. / 4] * len(X))
def mean_model(features, solutions, verbose=0):
    columns = solutions.columns

    clf = DummyRegressor()

    print('Training Model... ')
    clf.fit(features, solutions)
    print('Done Training')

    return (clf, columns)
def test_y_mean_attribute_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    # when strategy = 'mean'
    est = DummyRegressor(strategy='mean')
    est.fit(X, y)

    assert_equal(est.y_mean_, np.mean(y))
def train_classifier():
    X_train = tfv.transform(video_captions_train)
    X_test = tfv.transform(video_captions_test)

    dummy = DummyRegressor(strategy="median")
    dummy.fit(X_train, Y_train)
    Y_pred_med = dummy.predict(X_test)
def test_dummy_regressor_on_nan_value():
    X = [[np.NaN]]
    y = [1]
    y_expected = [1]

    clf = DummyRegressor()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
def test_dummy_regressor_on_3D_array():
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = np.array([2, 2, 2])
    y_expected = np.array([2, 2, 2])

    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred = cls.predict(X)
    assert_array_equal(y_pred, y_expected)
class Regressor(BaseEstimator):
    def __init__(self):
        self.clf = DummyRegressor()

    def fit(self, X, y):
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
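# The wrapper above does not return `self` from `fit`, which the scikit-learn
# estimator API expects (that convention is what lets Pipeline and
# cross_val_score chain calls). A minimal convention-following sketch,
# assuming only scikit-learn itself; the class name is hypothetical:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.dummy import DummyRegressor


class MeanBaselineRegressor(BaseEstimator, RegressorMixin):
    """Thin DummyRegressor wrapper that follows the estimator conventions."""

    def fit(self, X, y):
        self.clf_ = DummyRegressor()
        self.clf_.fit(X, y)
        return self  # enables chaining: est.fit(X, y).predict(X)

    def predict(self, X):
        return self.clf_.predict(X)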
def test_scorer_sample_weight():
    # Test that scorers support sample_weight or raise sensible errors

    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    _, y_ml = make_multilabel_classification(n_samples=X.shape[0],
                                             random_state=0)
    split = train_test_split(X, y, y_ml, random_state=0)
    X_train, X_test, y_train, y_test, y_ml_train, y_ml_test = split

    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy="median")
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    estimator = dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS]
        + [(name, sensible_clf) for name in CLF_SCORERS]
        + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )

    for name, scorer in SCORERS.items():
        if name in MULTILABEL_ONLY_SCORERS:
            target = y_ml_test
        else:
            target = y_test
        try:
            weighted = scorer(estimator[name], X_test, target,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], target[10:])
            unweighted = scorer(estimator[name], X_test, target)
            assert_not_equal(
                weighted, unweighted,
                msg="scorer {0} behaves identically when "
                    "called with sample weights: {1} vs "
                    "{2}".format(name, weighted, unweighted),
            )
            assert_almost_equal(
                weighted, ignored,
                err_msg="scorer {0} behaves differently when "
                        "ignoring samples and setting sample_weight to"
                        " 0: {1} vs {2}".format(name, weighted, ignored),
            )
        except TypeError as e:
            assert_true(
                "sample_weight" in str(e),
                "scorer {0} raises unhelpful exception when called "
                "with sample weights: {1}".format(name, str(e)),
            )
def test_median_strategy_regressor():
    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))
def test_dummy_regressor_return_std():
    X = [[0]] * 3  # ignored
    y = np.array([2, 2, 2])
    y_std_expected = np.array([0, 0, 0])

    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred_list = cls.predict(X, return_std=True)
    # there should be two elements when return_std is True
    assert_equal(len(y_pred_list), 2)
    # the second element should be all zeros
    assert_array_equal(y_pred_list[1], y_std_expected)
def simplest(cube, y, cv):
    """Just use the mean to impute the missing values."""
    from sklearn.dummy import DummyRegressor
    clf = DummyRegressor()
    X = cube.reshape(cube.shape[0], cube.shape[1] * cube.shape[2])
    sse = np.zeros(y.shape[1])
    for train, test in cv:
        y_train, y_test = y[train], y[test]
        y_predict = clf.fit(X[train], y[train]).predict(X[test])
        sse += np.mean((y_predict - y_test) ** 2, 0)
    return sse
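# `simplest` iterates over cv as a sequence of (train, test) index pairs, which
# matches the old scikit-learn CV iterators; with the modern API you would pass
# the generator returned by split(). A hedged usage sketch (the array shapes
# are illustrative assumptions, not from the original code):
import numpy as np
from sklearn.model_selection import KFold

cube = np.random.randn(60, 4, 5)  # hypothetical (samples, d1, d2) data
y = np.random.randn(60, 3)        # hypothetical multi-output targets
cv = KFold(n_splits=5).split(cube.reshape(60, -1))
per_output_mse = simplest(cube, y, cv)  # accumulated squared error per output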
def _make_estimators(X_train, y_train, y_ml_train):
    # Make estimators that make sense to test various scoring methods
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier(random_state=0)
    sensible_clf.fit(X_train, y_train)
    sensible_ml_clf = DecisionTreeClassifier(random_state=0)
    sensible_ml_clf.fit(X_train, y_ml_train)
    return dict(
        [(name, sensible_regr) for name in REGRESSION_SCORERS]
        + [(name, sensible_clf) for name in CLF_SCORERS]
        + [(name, sensible_ml_clf) for name in MULTILABEL_ONLY_SCORERS]
    )
def test_multioutput_regressor():
    X_learn = np.random.randn(10, 10)
    y_learn = np.random.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = np.random.randn(20, 10)
    y_test = np.random.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    assert_array_equal(np.tile(mean, (y_learn.shape[0], 1)), y_pred_learn)
    assert_array_equal(np.tile(mean, (y_test.shape[0], 1)), y_pred_test)
    _check_behavior_2d(est)
def test_mean_strategy_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    mean = np.mean(y_learn, axis=0).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)
    _check_equality_regressor(mean, y_learn, y_pred_learn, y_test, y_pred_test)
    _check_behavior_2d(est)
def test_regressor_prediction_independent_of_X(strategy):
    y = [0, 2, 1, 1]
    X1 = [[0]] * 4
    reg1 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg1.fit(X1, y)
    predictions1 = reg1.predict(X1)

    X2 = [[1]] * 4
    reg2 = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg2.fit(X2, y)
    predictions2 = reg2.predict(X2)

    assert_array_equal(predictions1, predictions2)
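# The test above takes `strategy` as an argument, so in its original context it
# is presumably driven by a pytest parametrize decorator; a plausible form
# (the exact strategy list is an assumption):
import pytest


@pytest.mark.parametrize("strategy", ["mean", "median", "quantile", "constant"])
def test_regressor_prediction_independent_of_X(strategy):
    ...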
def test_scorer_sample_weight():
    """Test that scorers support sample_weight or raise sensible errors"""
    # Unlike the metrics invariance test, in the scorer case it's harder
    # to ensure that, on the classifier output, weighted and unweighted
    # scores really should be unequal.
    X, y = make_classification(random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    sample_weight = np.ones_like(y_test)
    sample_weight[:10] = 0

    # get sensible estimators for each metric
    sensible_regr = DummyRegressor(strategy='median')
    sensible_regr.fit(X_train, y_train)
    sensible_clf = DecisionTreeClassifier()
    sensible_clf.fit(X_train, y_train)
    estimator = dict([(name, sensible_regr) for name in REGRESSION_SCORERS] +
                     [(name, sensible_clf) for name in CLF_SCORERS])

    for name, scorer in SCORERS.items():
        try:
            weighted = scorer(estimator[name], X_test, y_test,
                              sample_weight=sample_weight)
            ignored = scorer(estimator[name], X_test[10:], y_test[10:])
            unweighted = scorer(estimator[name], X_test, y_test)
            assert_not_equal(weighted, unweighted,
                             "scorer {0} behaves identically when called with "
                             "sample weights: {1} vs {2}".format(
                                 name, weighted, unweighted))
            assert_equal(weighted, ignored,
                         "scorer {0} behaves differently when ignoring "
                         "samples and setting sample_weight to 0: "
                         "{1} vs {2}".format(name, weighted, ignored))
        except TypeError as e:
            assert_true("sample_weight" in str(e),
                        "scorer {0} raises unhelpful exception when called "
                        "with sample weights: {1}".format(name, str(e)))
def _minimize_simbo_general(fun, x0,  # only used to get number of features
                            args=(), callback=None,
                            batch_size=100, population_size=10000,
                            maxiter=10000,
                            scorer=None,    # if no scorer given, scores are constant
                            selector=None,  # only relevant if a sampler is given
                            sampler=None):
    n_iter = int(maxiter / batch_size)
    assert n_iter > 0
    dummy_generator = generative_models.DummyGenerator(len(x0))
    if scorer is None:
        scorer = DummyRegressor()
    if sampler is None:
        sampler = dummy_generator
    if isinstance(selector, float) and 0 < selector < 1:
        selector = percentile_selector(selector)
    for i in range(n_iter):
        if i == 0:
            batch = dummy_generator.sample(batch_size)
        else:
            population = sampler.sample(population_size)
            scores = scorer.predict(population)
            batch_w_score = heapq.nsmallest(batch_size,
                                            zip(scores, population),
                                            key=lambda x: x[0])
            batch = [v for score, v in batch_w_score]
        results = optimize_utils.score_multi(fun, batch, args, callback)
        selected = selector(results, batch) if selector is not None else batch
        scorer.fit(batch, results)
        sampler.fit(selected)
    best_fval, best_x = max(zip(results, batch), key=lambda x: x[0])
    nfev = batch_size * n_iter
    return optimize_utils.to_result(x=best_x, fun=best_fval,
                                    niter=n_iter, nfev=nfev)
def test_constant_strategy_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    # test with 2d array
    constants = random_state.randn(5)

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)

    _check_equality_regressor(constants, y_learn, y_pred_learn,
                              y_test, y_pred_test)
    _check_behavior_2d_for_constant(est)
def test_weights_regressor():
    """Check weighted average regression prediction on the Boston dataset."""
    reg1 = DummyRegressor(strategy='mean')
    reg2 = DummyRegressor(strategy='median')
    reg3 = DummyRegressor(strategy='quantile', quantile=.2)
    ereg = VotingRegressor([('mean', reg1), ('median', reg2),
                            ('quantile', reg3)], weights=[1, 2, 10])

    X_r_train, X_r_test, y_r_train, y_r_test = \
        train_test_split(X_r, y_r, test_size=.25)

    reg1_pred = reg1.fit(X_r_train, y_r_train).predict(X_r_test)
    reg2_pred = reg2.fit(X_r_train, y_r_train).predict(X_r_test)
    reg3_pred = reg3.fit(X_r_train, y_r_train).predict(X_r_test)
    ereg_pred = ereg.fit(X_r_train, y_r_train).predict(X_r_test)

    avg = np.average(np.asarray([reg1_pred, reg2_pred, reg3_pred]),
                     axis=0, weights=[1, 2, 10])
    assert_almost_equal(ereg_pred, avg, decimal=2)

    ereg_weights_none = VotingRegressor([('mean', reg1), ('median', reg2),
                                         ('quantile', reg3)], weights=None)
    ereg_weights_equal = VotingRegressor([('mean', reg1), ('median', reg2),
                                          ('quantile', reg3)],
                                         weights=[1, 1, 1])
    ereg_weights_none.fit(X_r_train, y_r_train)
    ereg_weights_equal.fit(X_r_train, y_r_train)
    ereg_none_pred = ereg_weights_none.predict(X_r_test)
    ereg_equal_pred = ereg_weights_equal.predict(X_r_test)
    assert_almost_equal(ereg_none_pred, ereg_equal_pred, decimal=2)
def test_constant_strategy_regressor():
    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="constant", constant=[43])
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    reg = DummyRegressor(strategy="constant", constant=43)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))
def test_stacked_featurizer(self):
    data = self.make_test_data()
    data['y'] = [1, 2, 3]

    # Test for a regressor
    model = DummyRegressor()
    model.fit(self.multi.featurize_many(data['x']), data['y'])

    # Test the predictions
    f = StackedFeaturizer(self.single, model)
    self.assertEqual([2], f.featurize(data['x'][0]))

    # Test the feature names
    self.assertEqual(['prediction'], f.feature_labels())
    f.name = 'ML'
    self.assertEqual(['ML prediction'], f.feature_labels())

    # Test classifier
    model = DummyClassifier("prior")
    data['y'] = [0, 0, 1]
    model.fit(self.multi.featurize_many(data['x']), data['y'])

    # Test the prediction
    f.model = model
    self.assertEqual([2. / 3], f.featurize(data['x'][0]))

    # Test the feature labels
    self.assertRaises(ValueError, f.feature_labels)
    f.class_names = ['A', 'B']
    self.assertEqual(['ML P(A)'], f.feature_labels())

    # Test with three classes
    data['y'] = [0, 2, 1]
    model.fit(self.multi.featurize_many(data['x']), data['y'])
    self.assertArrayAlmostEqual([1. / 3] * 2, f.featurize(data['x'][0]))
    f.class_names = ['A', 'B', 'C']
    self.assertEqual(['ML P(A)', 'ML P(B)'], f.feature_labels())
def test_quantile_invalid():
    X = [[0]] * 5  # ignored
    y = [0] * 5    # ignored

    est = DummyRegressor(strategy="quantile")
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=None)
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=[0])
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=-0.1)
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile=1.1)
    assert_raises(ValueError, est.fit, X, y)

    est = DummyRegressor(strategy="quantile", quantile='abc')
    assert_raises(TypeError, est.fit, X, y)
yelp['hate'] = yelp.text.str.contains('hate', case=False).astype(int)

# add new features to the model
feature_cols = ['cool', 'useful', 'funny', 'length', 'love', 'hate']
X = yelp[feature_cols]
train_test_rmse(X, y)

# TASK 8 (BONUS): compare your best RMSE with RMSE for the null model

# split the data (outside of the function)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# use scikit-learn's built-in dummy regressor
from sklearn.dummy import DummyRegressor
dumb = DummyRegressor(strategy='mean')
dumb.fit(X_train, y_train)
y_dumb = dumb.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_dumb)))

# or, create a NumPy array with the right length, and fill it with the mean of y_train
y_null = np.zeros_like(y_test, dtype=float)
y_null.fill(y_train.mean())
print(np.sqrt(metrics.mean_squared_error(y_test, y_null)))

# TASK 9 (BONUS): treat this as a classification problem, try KNN, maximize your accuracy
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=150)
knn.fit(X_train, y_train)
# data including the betting lines
test_season = pd.read_csv('data/test_season.csv')
train_season = pd.read_csv('data/train_season.csv')

X_train_s = train_season.drop('GAME_TOTAL', axis=1).to_numpy()
y_train_s = train_season['GAME_TOTAL'].to_numpy()
X_test_s = test_season.drop('GAME_TOTAL', axis=1).to_numpy()
y_test_s = test_season['GAME_TOTAL'].to_numpy()

Test_Vegas = test_season['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_season['TOTAL_CLOSE'].to_numpy()

# Vegas BASELINE = 17.650007402704748
mean_squared_error(np.append(y_train_s, y_test_s),
                   np.append(Train_Vegas, Test_Vegas), squared=False)

# DUMMY REGRESSOR:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_s, y_train_s)

# -0.7833193001644205
dummy_regr.score(X_test_s, y_test_s)

# 27.845427872989156
mean_squared_error(y_test_s, dummy_regr.predict(X_test_s), squared=False)

# OLS
regressor = sm.OLS(y_train_s, X_train_s)
regressor = regressor.fit()

# evidently this returned a 0.991 R**2
# second run gave us 0.993
regressor.summary()

preds = regressor.predict(X_test_s)

# 18.5802074596655
mean_squared_error(y_test_s, preds, squared=False)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import joblib
import sys
from pathlib import Path

sys.path.append('/home/jiajunb/prosocial-conversations')
from models import XGBOOST_FEATURES, EIGENMETRICS

ROOT_DIR = Path('/shared/0/projects/prosocial/data/finalized/')
train_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/train.tsv',
                       sep='\t', usecols=XGBOOST_FEATURES + EIGENMETRICS)
train_X = train_df[XGBOOST_FEATURES].values
train_y = train_df[EIGENMETRICS].values.reshape(-1)

dummy_clf = DummyRegressor(strategy="mean")
dummy_clf.fit(train_X, train_y)

# on training set
train_preds = dummy_clf.predict(train_X)
print(f'R^2 on training set: {r2_score(train_y, train_preds)}')
print(f'MSELoss on training set: {mean_squared_error(train_preds, train_y)}')

output_path = ROOT_DIR / 'model_checkpoints/dummy'
output_path.mkdir(exist_ok=True, parents=True)
joblib.dump(dummy_clf, output_path / 'dummy.model.buffer')

test_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/test.tsv',
                      sep='\t', usecols=XGBOOST_FEATURES + EIGENMETRICS)
test_X = test_df[XGBOOST_FEATURES].values
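# The snippet is cut off after loading the test features; a plausible
# continuation, mirroring the training-set evaluation above (sketch, not the
# original code):
test_y = test_df[EIGENMETRICS].values.reshape(-1)
test_preds = dummy_clf.predict(test_X)
print(f'R^2 on test set: {r2_score(test_y, test_preds)}')
print(f'MSELoss on test set: {mean_squared_error(test_y, test_preds)}')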
def DummyPrediction(X_train, y_train, X_test, y_test):
    dummy = DummyRegressor()
    dummy = dummy.fit(X_train, y_train)
    y_pred = dummy.predict(X_test)
    return y_pred
def test_regressor_exceptions():
    reg = DummyRegressor()
    assert_raises(ValueError, reg.predict, [])
def test_set_params_nested_pipeline():
    estimator = Pipeline([('a', Pipeline([('b', DummyRegressor())]))])
    estimator.set_params(a__b__alpha=0.001, a__b=Lasso())
    estimator.set_params(a__steps=[('b', LogisticRegression())], a__b__C=5)
def main():
    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # The result turns out to be worse using non-linear polynomial regression
    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train)
    # X_test = poly.fit_transform(X_test)
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression: (baseline)')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # KNN Regression
    # print('\nKNN Regression: ')
    # model = KNeighborsRegressor()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # Neural Network - Bernoulli Restricted Boltzmann Machine (RBM)
    # print('\nNeural Network - RBM: ')
    # model = BernoulliRBM()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
def test_unknown_strategy_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy='gona')
    assert_raises(ValueError, est.fit, X, y)
    reg_drop = StackingRegressor(estimators=estimators,
                                 final_estimator=rf,
                                 cv=5)

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)
    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))


@pytest.mark.parametrize(
    "cv", [3, KFold(n_splits=3, shuffle=True, random_state=42)])
@pytest.mark.parametrize(
    "final_estimator, predict_params",
    [(None, {}),
     (RandomForestRegressor(random_state=42), {}),
     (DummyRegressor(), {'return_std': True})])
@pytest.mark.parametrize("passthrough", [False, True])
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=final_estimator,
                            cv=cv,
                            passthrough=passthrough)
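    # The test body is truncated above; in the scikit-learn suite it presumably
    # continues by fitting the stacker and exercising predict_params, roughly
    # like this (sketch, not verified against the original source):
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    # DummyRegressor with return_std=True yields a (prediction, std) pair
    if predict_params:
        assert len(result) == 2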
from sklearn.metrics import mean_squared_error

# In[35]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42)

# # Baseline Model

# In[36]:

from sklearn.dummy import DummyRegressor

dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train, y_train)
dummy_regr.predict(X_train)
baseline = dummy_regr.score(X_train, y_train)
print("Baseline R^2: %f" % baseline)

# # Multiple Linear Regression

# In[37]:

ols = linear_model.LinearRegression()
ols.fit(X_train, y_train)
print("Coefficients: %s" % ols.coef_)
print("Intercept: %f" % ols.intercept_)
y_test_prediction = ols.predict(X_test)
ols.score(X_train, y_train)
def test_regressor_score_with_None(y, y_test):
    reg = DummyRegressor()
    reg.fit(None, y)
    assert reg.score(None, y_test) == 1.0
import numpy as np
import matplotlib.pyplot as plt  # needed for the plotting calls below
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor

diabetes = datasets.load_diabetes()

X = diabetes.data[:, None, 6]
y = diabetes.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lm = LinearRegression().fit(X_train, y_train)
lm_dummy_mean = DummyRegressor(strategy='mean').fit(X_train, y_train)

y_predict = lm.predict(X_test)
y_predict_dummy_mean = lm_dummy_mean.predict(X_test)

print('Linear model, coefficients: ', lm.coef_)
print("Mean squared error (dummy): {:.2f}".format(
    mean_squared_error(y_test, y_predict_dummy_mean)))
print("Mean squared error (linear model): {:.2f}".format(
    mean_squared_error(y_test, y_predict)))
print("r2_score (dummy): {:.2f}".format(
    r2_score(y_test, y_predict_dummy_mean)))
print("r2_score (linear model): {:.2f}".format(r2_score(y_test, y_predict)))

# Plot outputs
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_predict, color='green', linewidth=2)
plt.plot(X_test, y_predict_dummy_mean, color='red', linestyle='dashed',
import csv
import pickle

import numpy as np
from sklearn.dummy import DummyRegressor

age_range = 80
gender = {'male': 0, 'other': 0.5, 'female': 1}

X = np.array([[20 / age_range, gender['male']],
              [56 / age_range, gender['other']]])
Y = np.array([[.2], [.7]])

clf = DummyRegressor()
clf.fit(X, Y)

# print([r[2] for r in data])
print(Y)
# print([
#     movies[int(round(idx * len(movies)))]
#     for idx in clf.predict(X)
# ])
print(clf.predict([[0.2, 1]]))

with open('model.pk', 'wb') as outfile:
    pickle.dump(clf, outfile)
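# For completeness, a minimal sketch of loading the pickled baseline back and
# reusing it (assumes the model.pk file written above):
with open('model.pk', 'rb') as infile:
    loaded = pickle.load(infile)

print(loaded.predict([[0.5, 0.5]]))  # the same constant mean prediction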
except ImportError:
    # for scikit-learn 0.18 and 0.19
    from sklearn.metrics.scorer import check_scoring

# Regression
ridge = RidgeCV()
svr = SVR(kernel='linear')

# Classification
svc = LinearSVC()
logistic_l1 = LogisticRegression(penalty='l1')
logistic_l2 = LogisticRegression(penalty='l2')
ridge_classifier = RidgeClassifierCV()
random_forest = RandomForestClassifier()

dummy_classifier = DummyClassifier(random_state=0)
dummy_regressor = DummyRegressor()

regressors = {'ridge': (ridge, []),
              'svr': (svr, 'C')}
classifiers = {'svc': (svc, 'C'),
               'logistic_l1': (logistic_l1, 'C'),
               'logistic_l2': (logistic_l2, 'C'),
               'ridge_classifier': (ridge_classifier, [])}

# Create a test dataset
rng = np.random.RandomState(0)
X = rng.rand(100, 10)

# Create different targets
y_regression = rng.rand(100)
y_classification = np.hstack([[-1] * 50, [1] * 50])
y_classification_str = np.hstack([['face'] * 50, ['house'] * 50])
y_multiclass = np.hstack([[0] * 35, [1] * 30, [2] * 35])
# Load libraries.
from sklearn.datasets import load_boston
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split

# Load the data.
boston = load_boston()

# Create the features and target.
features, target = boston.data, boston.target

# Split into training and test sets.
features_train, features_test, target_train, target_test = train_test_split(
    features, target, random_state=0)

# Create the dummy regressor.
dummy = DummyRegressor(strategy='mean')

# "Train" the dummy regressor.
dummy.fit(features_train, target_train)

# Get the R-squared score.
dummy.score(features_test, target_test)
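# The point of a DummyRegressor baseline is the comparison that usually
# follows; a hedged sketch (same split as above) contrasting it with a real
# model:
from sklearn.linear_model import LinearRegression

ols = LinearRegression()
ols.fit(features_train, target_train)

# A useful model should beat the dummy's R^2, which is ~0 (or slightly
# negative) on held-out data for the mean strategy.
print(dummy.score(features_test, target_test))
print(ols.score(features_test, target_test))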
def test_quantile_strategy_regressor():
    random_state = np.random.RandomState(seed=1)

    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    reg = DummyRegressor(strategy="quantile", quantile=0.5)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.median(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.min(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=1)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.max(y)] * len(X))

    reg = DummyRegressor(strategy="quantile", quantile=0.3)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [np.percentile(y, q=30)] * len(X))
def base_dummy(self):
    model = DummyRegressor(strategy='mean')
    setattr(model, 'data_schema', self.data._X.columns.values)
    setattr(model, 'model_path', 'model_path')
    return model.fit(self.data._X, self.data._y)
def test_quantile_strategy_empty_train():
    est = DummyRegressor(strategy="quantile", quantile=0.4)
    assert_raises(ValueError, est.fit, [], [])
def __init__(self):
    self.clf = DummyRegressor()
def test_regressor_score_with_None(y, y_test):
    reg = DummyRegressor()
    reg.fit(None, y)
    assert_equal(reg.score(None, y_test), 1.0)
# be significantly imbalanced, and even a simplistic model that would only
# predict the mean can achieve an accuracy of 93%.
#
# To evaluate the pertinence of the used metrics, we will consider as a
# baseline a "dummy" estimator that constantly predicts the mean frequency of
# the training sample.

from sklearn.dummy import DummyRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.33, random_state=0)

dummy = Pipeline([
    ("preprocessor", linear_model_preprocessor),
    ("regressor", DummyRegressor(strategy="mean")),
]).fit(df_train, df_train["Frequency"],
       regressor__sample_weight=df_train["Exposure"])

##############################################################################
# Let's compute the performance of this constant prediction baseline with 3
# different regression metrics:

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_poisson_deviance


def score_estimator(estimator, df_test):
    """Score an estimator on the test set."""
def test_init(self):
    regressor = DummyRegressor()
    regressor.fit(numpy.array([[0], [0]]), numpy.array([0.0, 2.0]))
    self.assertEqual(1.0, regressor.constant_)
    regressor_proxy = EstimatorProxy(regressor, attr_names_=["constant_"])
    self.assertEqual(1.0, regressor_proxy.constant_)
def test_quantile_strategy_multioutput_regressor():
    random_state = np.random.RandomState(seed=1)

    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)
    _check_equality_regressor(median, y_learn, y_pred_learn,
                              y_test, y_pred_test)
    _check_behavior_2d(est)

    # Correctness oracle
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    y_pred_learn = est.predict(X_learn)
    y_pred_test = est.predict(X_test)
    _check_equality_regressor(quantile_values, y_learn, y_pred_learn,
                              y_test, y_pred_test)
    _check_behavior_2d(est)
def main():
    # read review data
    print('parsing review data...')
    reviews = parse_json('./yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

    # use only reviews posted after 2008
    valid_reviews = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        if review_date.year < 2008:
            continue
        valid_reviews.append(review)
    reviews = valid_reviews

    # sample the data
    # sample_num = len(reviews)
    # print('sampling...', sample_num, 'out of', len(reviews))
    # reviews = sample(reviews, sample_num)

    # tokenize text for all reviews
    print('tokenizing text for all reviews...')
    texts = [review['text'] for review in reviews]
    count_vect = CountVectorizer(max_features=100)
    X = count_vect.fit_transform(texts)

    # transform from occurrence to frequency
    print('converting occurrence to frequency...')
    tfidf_transformer = TfidfTransformer()
    X = tfidf_transformer.fit_transform(X)

    # load the linear model for normalization
    clf = joblib.load('./normalization/linear_model_for_normalization.pkl')

    # get labels
    print('calculating labels...')
    y = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        # normalize
        normalizor = clf.predict(np.array([[review_date.year]]))[0][0]
        review_quality = sum(review['votes'].values()) / normalizor
        y.append(review_quality)

    # splitting into train and test set
    print('splitting into train and test set...')
    train_len = int(X.shape[0] * 0.6)
    X_train = X[:train_len, :]
    y_train = y[:train_len]
    X_test = X[train_len:, :]
    y_test = y[train_len:]
    print('train size:', X_train.shape)
    print('test size:', X_test.shape)

    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train.toarray())
    # X_test = poly.fit_transform(X_test.toarray())
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression:')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Ridge
    print('\nRidge: ')
    model = Ridge()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Passive Aggressive
    print('\nPassive Aggressive: ')
    model = PassiveAggressiveRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
print("Percentage of zero claims = {0:%}".format( df.loc[df["ClaimNb"] == 0, "Exposure"].sum() / df["Exposure"].sum())) ############################################################################## # It is worth noting that 92 % of policyholders have zero claims, and if we # were to convert this problem into a binary classification task, it would be # significantly imbalanced. # # To evaluate the pertinence of the used metrics, we will consider as a # baseline a "dummy" estimator that constantly predicts the mean frequency of # the training sample. df_train, df_test = train_test_split(df, random_state=0) dummy = make_pipeline(linear_model_preprocessor, DummyRegressor(strategy='mean')) dummy.fit(df_train, df_train["Frequency"], dummyregressor__sample_weight=df_train["Exposure"]) def score_estimator(estimator, df_test): """Score an estimator on the test set.""" y_pred = estimator.predict(df_test) print( "MSE: %.3f" % mean_squared_error(df_test["Frequency"], y_pred, df_test["Exposure"])) print( "MAE: %.3f" %
from sklearn.metrics import mean_absolute_error
from sklearn.dummy import DummyRegressor

nobs = X_meg.shape[0]
max_comps = range(2, 30, 2)
nfolds = 50
cv = ShuffleSplit(nobs, n_iter=nfolds, test_size=.1)

# Trying the prediction with different components
comp_scores = []
dumb_scores = []
for ncomp in max_comps:
    print('Trying %d components' % ncomp)
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')

    mae = 0
    dumb_mae = 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]

        pls.fit(X_fmri_train, X_meg_train)
        pred = pls.predict(X_fmri_test)
        mae += mean_absolute_error(X_meg_test, pred)

        dumb.fit(X_fmri_train, X_meg_train)
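        # The inner loop is cut off above after fitting the dummy baseline; a
        # plausible continuation mirroring the PLS branch (sketch, assumed):
        dumb_pred = dumb.predict(X_fmri_test)
        dumb_mae += mean_absolute_error(X_meg_test, dumb_pred)

    # average over folds, one score per component count (assumed bookkeeping)
    comp_scores.append(mae / nfolds)
    dumb_scores.append(dumb_mae / nfolds)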
def test_constants_not_specified_regressor():
    X = [[0]] * 5
    y = [1, 2, 4, 6, 8]

    est = DummyRegressor(strategy='constant')
    assert_raises(TypeError, est.fit, X, y)
# prepare configuration for cross validation test harness
num_folds = 10
seed = 7

# prepare models
models = []
models.append(('LR', LinearRegression()))
models.append(('Ridge', Ridge()))
# models.append(('ARDRegression', linear_model.ARDRegression()))
models.append(('Lasso', linear_model.Lasso()))
models.append(('LassoCV', linear_model.LassoCV()))
models.append(('LassoLars', linear_model.LassoLars()))
# Decision tree
models.append(('Dec tree', tree.DecisionTreeRegressor()))
# sanity check
models.append(('Dummy', DummyRegressor("median")))


def keras_baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(128, input_dim=numFeatures, init='normal',
                    activation='relu'))
    model.add(Dense(1, init='normal', activation="relu"))
    # Compile model
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model


models.append(('Keras', KerasRegressor(build_fn=keras_baseline_model,
    pipeline.configure(**pmml_options)
    if isinstance(regressor, XGBRegressor):
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        predict_params=predict_params,
                        precision=1e-5, zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13),
                        predict_params=predict_params)
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X, **predict_params),
                    columns=["mpg"])
    store_csv(mpg, name)


if "Auto" in datasets:
    build_auto(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf=5, random_state=13), random_state=13, n_estimators=17), "AdaBoostAuto")
    build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(min_samples_leaf=2, random_state=13), "DecisionTreeAuto", compact=False)
    build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf=5, random_state=13), n_estimators=3, max_features=0.5, random_state=13), "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy="median"), "DummyAuto")
    build_auto(ElasticNetCV(cv=3, random_state=13), "ElasticNetAuto")
    build_auto(ExtraTreesRegressor(n_estimators=10, min_samples_leaf=5, random_state=13), "ExtraTreesAuto")
    build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators=7, max_depth=6, random_state=13), LinearRegression()), "GBDTLMAuto")
    build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators=17, max_depth=6, random_state=13), ElasticNet(random_state=13)), "XGBRFLMAuto")
    build_auto(GradientBoostingRegressor(init=None, random_state=13), "GradientBoostingAuto")
    build_auto(HistGradientBoostingRegressor(max_iter=31, random_state=13), "HistGradientBoostingAuto")
    build_auto(HuberRegressor(), "HuberAuto")
    build_auto(LarsCV(cv=3), "LarsAuto")
    build_auto(LassoCV(cv=3, random_state=13), "LassoAuto")
    build_auto(LassoLarsCV(cv=3), "LassoLarsAuto")
    build_auto(LinearRegression(), "LinearRegressionAuto")
    build_auto(BaggingRegressor(LinearRegression(), max_features=0.75, random_state=13), "LinearRegressionEnsembleAuto")
    build_auto(OrthogonalMatchingPursuitCV(cv=3), "OMPAuto")
    build_auto(RandomForestRegressor(n_estimators=10, min_samples_leaf=3, random_state=13), "RandomForestAuto", flat=True)
    build_auto(RidgeCV(), "RidgeAuto")
import pandas as pd
from sklearn.dummy import DummyRegressor

# Loading in the data
canucks = pd.read_csv('data/canucks_subbed.csv')

# Define X and y
X = canucks.loc[:, ['No.', 'Age', 'Height', 'Weight', 'Experience']]
y = canucks['Salary']

# Create a model
model = DummyRegressor(strategy="mean")

# Fit your data
model.fit(X, y)

# Predict the labels of X
model.predict(X)

# The model score (for regressors this is R^2, not accuracy; the mean
# strategy scores 0.0 on its own training data by construction)
accuracy = round(model.score(X, y), 2)
accuracy
df = test_regressor(ExtraTreesRegressor(n_estimators=1000), df)
df = test_regressor(GradientBoostingRegressor(n_estimators=1000), df)
df = test_regressor(RandomForestRegressor(n_estimators=1000), df)
df = test_regressor(GaussianProcessRegressor(), df)
# df = test_regressor(IsotonicRegression(), df) - has errors
df = test_regressor(LinearSVR(), df)
df = test_regressor(NuSVR(), df)
df = test_regressor(SVR(), df)
df = test_regressor(XGBRegressor(n_estimators=1000), df)
df = test_regressor(lgb.LGBMRegressor(n_estimators=1000), df)
df = test_regressor(CatBoostRegressor(n_estimators=1000), df)
df = test_regressor(DecisionTreeRegressor(max_depth=3), df)
df = test_regressor(KNeighborsRegressor(), df)
# df = test_regressor(RadiusNeighborsRegressor(), df) - also has errors
df = test_regressor(DummyRegressor(), df)
df = test_regressor(
    StackingRegressor(regressors=[
        GradientBoostingRegressor(n_estimators=1000),
        HuberRegressor(),
        RidgeCV(cv=5),
        BayesianRidge(compute_score=True, copy_X=True)
    ], meta_regressor=LassoCV(cv=5)), df)
df = test_regressor(
    StackingRegressor(regressors=[
        ElasticNetCV(),
        HuberRegressor(),
        RidgeCV(cv=5),
from sklearn.dummy import DummyRegressor

nobs = X_meg.shape[0]
max_comps = range(5, 30, 5)
nfolds = 50
cv = ShuffleSplit(nobs, n_iter=nfolds, test_size=.1)
y = inatt

# Trying the prediction with different components
comp_scores = []
dumb_scores = []
meg_scores, fmri_scores = [], []
for ncomp in max_comps:
    print('Trying %d components' % ncomp)
    pls = PLSRegression(n_components=ncomp)
    dumb = DummyRegressor(strategy='mean')

    mae = 0
    dumb_mae = 0
    meg_mae, fmri_mae = 0, 0
    for oidx, (train, test) in enumerate(cv):
        X_fmri_train = X_fmri[train]
        X_fmri_test = X_fmri[test]
        X_meg_train = X_meg[train]
        X_meg_test = X_meg[test]
        y_train = y[train]
        y_test = y[test]

        X_train = np.hstack([X_fmri_train, X_meg_train])
        X_test = np.hstack([X_fmri_test, X_meg_test])
def fit(self, X, y):
    self.reg = DummyRegressor()
    return self.reg.fit(X, y)
    # 'gb': GradientBoostingClassifier(),
    'xgb': XGBClassifier(),
    'dummy': DummyClassifier()
}

REGRESSORS = {
    'lr': LinearRegression(),
    'lasso': Lasso(),
    'ridge': Ridge(),
    'mlp': MLPRegressor(),
    'SVC': svm.SVR(),
    'knn': KNeighborsRegressor(),
    'rf': RandomForestRegressor(),
    'gb': GradientBoostingRegressor(),
    'xgb': XGBRegressor(),
    'dummy': DummyRegressor()
}


def spotcheck(estimators=CLASSIFIERS, X=None, y=None, score='roc_auc',
              cv=3, sort_by='mean'):
    logging.info("Evaluation metrics: " + str(score))
    results = {}

    @timeit(get_time=True)
    def _eval(clf):
        scores = cross_val_score(estimator=clf, X=X, y=y, scoring=score,
                                 cv=cv)
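        # The function body is truncated above; a plausible continuation
        # (sketch — `_eval`'s (result, elapsed) return shape from the
        # @timeit(get_time=True) decorator and the results bookkeeping are
        # assumptions, not from the original source):
        return scores

    for name, clf in estimators.items():
        scores, elapsed = _eval(clf)
        results[name] = {'mean': scores.mean(),
                         'std': scores.std(),
                         'time': elapsed}

    # rank estimators by the requested statistic
    return sorted(results.items(),
                  key=lambda kv: kv[1][sort_by], reverse=True)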