def test_quantile_strategy_multioutput_regressor():
    """Quantile strategy on multioutput y matches per-column percentiles."""
    rng = np.random.RandomState(seed=1)
    X_learn = rng.randn(10, 10)
    y_learn = rng.randn(10, 5)
    median = np.median(y_learn, axis=0).reshape((1, -1))
    quantile_values = np.percentile(y_learn, axis=0, q=80).reshape((1, -1))
    X_test = rng.randn(20, 10)
    y_test = rng.randn(20, 5)

    # Correctness oracle: quantile=0.5 is the per-column median.
    est = DummyRegressor(strategy="quantile", quantile=0.5)
    est.fit(X_learn, y_learn)
    _check_equality_regressor(
        median, y_learn, est.predict(X_learn), y_test, est.predict(X_test))
    _check_behavior_2d(est)

    # Correctness oracle: quantile=0.8 matches np.percentile(q=80).
    est = DummyRegressor(strategy="quantile", quantile=0.8)
    est.fit(X_learn, y_learn)
    _check_equality_regressor(
        quantile_values, y_learn, est.predict(X_learn),
        y_test, est.predict(X_test))
    _check_behavior_2d(est)
def test_regressor_prediction_independent_of_X(strategy):
    """Predictions depend only on y — swapping X must not change them."""
    y = [0, 2, 1, 1]

    reg_a = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg_a.fit([[0]] * 4, y)
    preds_a = reg_a.predict([[0]] * 4)

    # Same targets, different (equally ignored) features.
    reg_b = DummyRegressor(strategy=strategy, constant=0, quantile=0.7)
    reg_b.fit([[1]] * 4, y)
    preds_b = reg_b.predict([[1]] * 4)

    assert_array_equal(preds_a, preds_b)
def test_constant_strategy_regressor():
    """Constant strategy echoes the user-supplied value for every sample."""
    rng = np.random.RandomState(seed=1)
    X = [[0]] * 5  # features are ignored by the dummy
    y = rng.randn(5)

    # The constant may be given wrapped in a list...
    reg = DummyRegressor(strategy="constant", constant=[43])
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))

    # ...or as a bare scalar; both behave identically.
    reg = DummyRegressor(strategy="constant", constant=43)
    reg.fit(X, y)
    assert_array_equal(reg.predict(X), [43] * len(X))
def test_regressor():
    """The default strategy predicts the training-set mean for every row."""
    X = [[0]] * 4  # ignored
    y = [1, 2, 1, 1]

    reg = DummyRegressor()
    reg.fit(X, y)

    expected = [5. / 4] * len(X)  # mean of y
    assert_array_equal(reg.predict(X), expected)
def train_classifier():
    """Fit a median-predicting DummyRegressor baseline on caption features.

    NOTE(review): relies on module-level globals (`tfv`, `video_captions_train`,
    `video_captions_test`, `Y_train`) — confirm they exist before calling.
    The test predictions are computed but not returned or stored here.
    """
    X_train = tfv.transform(video_captions_train)
    X_test = tfv.transform(video_captions_test)
    # Baseline: always predict the median of the training targets.
    dummy = DummyRegressor(strategy="median")
    dummy.fit(X_train, Y_train)
    Y_pred_med = dummy.predict(X_test)
def test_dummy_regressor_on_3D_array():
    """A 3-D, non-numeric X is accepted: only y is ever inspected."""
    X = np.array([[['foo']], [['bar']], [['baz']]])
    y = np.array([2, 2, 2])
    expected = np.array([2, 2, 2])

    est = DummyRegressor()
    est.fit(X, y)

    assert_array_equal(est.predict(X), expected)
def test_dummy_regressor_on_nan_value():
    """A NaN feature value is tolerated because X is never inspected."""
    # Fix: np.NaN was removed in NumPy 2.0; np.nan is the canonical
    # (and behaviorally identical) spelling.
    X = [[np.nan]]
    y = [1]
    y_expected = [1]
    clf = DummyRegressor()
    clf.fit(X, y)
    y_pred = clf.predict(X)
    assert_array_equal(y_pred, y_expected)
class Regressor(BaseEstimator):
    """Thin wrapper delegating everything to a mean-predicting DummyRegressor."""

    def __init__(self):
        # The wrapped estimator; default strategy is "mean".
        self.clf = DummyRegressor()

    def fit(self, X, y):
        """Fit the wrapped estimator.

        Returns self, following the scikit-learn estimator convention
        (backward-compatible: the original returned None, which callers ignore).
        """
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for X."""
        return self.clf.predict(X)
def test_multioutput_regressor():
    """Mean strategy tiles the per-column training mean across all rows."""
    X_learn = np.random.randn(10, 10)
    y_learn = np.random.randn(10, 5)
    mean = np.mean(y_learn, axis=0).reshape((1, -1))
    X_test = np.random.randn(20, 10)
    y_test = np.random.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)

    for X_eval, n_rows in ((X_learn, y_learn.shape[0]),
                           (X_test, y_test.shape[0])):
        assert_array_equal(np.tile(mean, (n_rows, 1)), est.predict(X_eval))
    _check_behavior_2d(est)
class PerfectClassifierMeanRegressor():
    """Hybrid baseline: an oracle zero/non-zero classifier combined with a
    mean-predicting regressor for the non-zero cases.

    NOTE(review): the "perfect classification" is simulated by reading the
    TRUE labels of each split to decide which predictions are zeroed out,
    so test scores are optimistic by construction.
    """

    def fit(self, X: pd.DataFrame, y: pd.Series):
        # Only stores the data; the regressor is (re)fitted per CV split.
        self.X = X
        self.y = y
        self.regressor = DummyRegressor(strategy='mean')

    def cross_val(self, scoring, k=10):
        """K-fold CV. `scoring` maps metric name -> scorer(y_true, y_pred);
        returns a dict of per-fold score lists keyed '<split>_<metric>'."""
        self.scores = {}
        for name, scorer in scoring.items():
            for split in ['train', 'test']:
                self.scores[split + '_' + name] = []
        splitter = KFold(n_splits=k, shuffle=True, random_state=7)
        for train_index, test_index in splitter.split(self.X, self.y):
            X_train = self.X.values[train_index]
            y_train = self.y.values[train_index]
            X_test = self.X.values[test_index]
            y_test = self.y.values[test_index]
            # get test y class labels for perfect classification
            y_test_binary = (y_test != 0)
            y_train_binary = (y_train != 0)
            self.regressor.fit(X_train, y_train.reshape(-1, 1))
            reg_pred_test = self.regressor.predict(X_test).flatten()
            reg_pred_train = self.regressor.predict(X_train).flatten()
            # Zero out predictions wherever the (true) class label is zero.
            y_pred_test = np.multiply(y_test_binary, reg_pred_test)
            y_pred_train = np.multiply(y_train_binary, reg_pred_train)
            for name, scorer in scoring.items():
                self.scores['test_' + name].append(scorer(y_test, y_pred_test))
                self.scores['train_' + name].append(scorer(y_train, y_pred_train))
        return self.scores

    def get_params(self):
        # Delegate to the underlying DummyRegressor's parameters.
        return(self.regressor.get_params())
def test_mean_strategy_multioutput_regressor():
    """Per-column training mean is predicted for every row of a 2-D y."""
    rng = np.random.RandomState(seed=1)
    X_learn, y_learn = rng.randn(10, 10), rng.randn(10, 5)
    expected = np.mean(y_learn, axis=0).reshape((1, -1))
    X_test, y_test = rng.randn(20, 10), rng.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor()
    est.fit(X_learn, y_learn)
    _check_equality_regressor(
        expected, y_learn, est.predict(X_learn), y_test, est.predict(X_test))
    _check_behavior_2d(est)
def test_multioutput_regressor():
    """Default (mean) strategy repeats column means on train and test data."""
    features_train = np.random.randn(10, 10)
    targets_train = np.random.randn(10, 5)
    col_means = np.mean(targets_train, axis=0).reshape((1, -1))
    features_test = np.random.randn(20, 10)
    targets_test = np.random.randn(20, 5)

    # Correctness oracle
    model = DummyRegressor()
    model.fit(features_train, targets_train)
    pred_train = model.predict(features_train)
    pred_test = model.predict(features_test)
    assert_array_equal(np.tile(col_means, (targets_train.shape[0], 1)),
                       pred_train)
    assert_array_equal(np.tile(col_means, (targets_test.shape[0], 1)),
                       pred_test)
    _check_behavior_2d(model)
def test_median_strategy_regressor():
    """Median strategy predicts np.median(y) for every sample."""
    rng = np.random.RandomState(seed=1)
    X = [[0]] * 5  # ignored
    y = rng.randn(5)

    reg = DummyRegressor(strategy="median")
    reg.fit(X, y)

    expected = [np.median(y)] * len(X)
    assert_array_equal(reg.predict(X), expected)
class DummyEstimator(BaseTesterEstimator):
    """Adapter exposing a DummyRegressor through the tester-estimator API."""

    def __init__(self):
        # Wrapped mean-strategy dummy model.
        self.regressor = DummyRegressor()

    def fit(self, x, y):
        # Delegate training to the wrapped model.
        self.regressor.fit(x, y)

    def predict(self, x):
        # Delegate inference to the wrapped model.
        return self.regressor.predict(x)
def test_mean_strategy_regressor():
    """Default strategy predicts the mean of y for each row."""
    rng = np.random.RandomState(seed=1)
    features = [[0]] * 4  # ignored
    targets = rng.randn(4)

    model = DummyRegressor()
    model.fit(features, targets)

    assert_array_equal(model.predict(features),
                       [np.mean(targets)] * len(features))
def test_dummy_regressor_return_std():
    """predict(return_std=True) yields (mean, std), with zero std for constant y."""
    X = [[0]] * 3  # ignored
    y = np.array([2, 2, 2])
    y_std_expected = np.array([0, 0, 0])
    cls = DummyRegressor()
    cls.fit(X, y)
    y_pred_list = cls.predict(X, return_std=True)
    # there should be two elements when return_std is True
    # (plain assert: the deprecated numpy.testing assert_equal helper is
    # unnecessary for a scalar comparison, and the sibling copy of this
    # test already uses a bare assert)
    assert len(y_pred_list) == 2
    # the second element should be all zeros
    assert_array_equal(y_pred_list[1], y_std_expected)
def test_dummy_regressor_return_std():
    """With return_std=True the dummy reports zero spread for constant targets."""
    features = [[0]] * 3  # ignored
    targets = np.array([2, 2, 2])
    expected_std = np.array([0, 0, 0])

    model = DummyRegressor()
    model.fit(features, targets)
    prediction_pair = model.predict(features, return_std=True)

    # Exactly two elements come back: (predictions, standard deviations).
    assert len(prediction_pair) == 2
    # Constant training targets imply zero predictive spread.
    assert_array_equal(prediction_pair[1], expected_std)
def test_mean_strategy_multioutput_regressor():
    """Mean strategy: each output column is predicted as its training mean."""
    random_state = np.random.RandomState(seed=1)
    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)
    column_means = np.mean(y_learn, axis=0).reshape((1, -1))
    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    regressor = DummyRegressor()
    regressor.fit(X_learn, y_learn)
    predictions_learn = regressor.predict(X_learn)
    predictions_test = regressor.predict(X_test)
    _check_equality_regressor(column_means, y_learn, predictions_learn,
                              y_test, predictions_test)
    _check_behavior_2d(regressor)
def test_quantile_strategy_regressor():
    """Quantile strategy agrees with numpy's median/min/max/percentile."""
    rng = np.random.RandomState(seed=1)
    X = [[0]] * 5  # ignored
    y = rng.randn(5)

    # (quantile, expected constant prediction) pairs.
    cases = [
        (0.5, np.median(y)),
        (0, np.min(y)),
        (1, np.max(y)),
        (0.3, np.percentile(y, q=30)),
    ]
    for quantile, expected in cases:
        reg = DummyRegressor(strategy="quantile", quantile=quantile)
        reg.fit(X, y)
        assert_array_equal(reg.predict(X), [expected] * len(X))
def test_constant_strategy_multioutput_regressor():
    """A vector constant is echoed for every row of a multioutput y."""
    rng = np.random.RandomState(seed=1)
    X_learn, y_learn = rng.randn(10, 10), rng.randn(10, 5)
    # test with 2d array
    constants = rng.randn(5)
    X_test, y_test = rng.randn(20, 10), rng.randn(20, 5)

    # Correctness oracle
    est = DummyRegressor(strategy="constant", constant=constants)
    est.fit(X_learn, y_learn)
    _check_equality_regressor(
        constants, y_learn, est.predict(X_learn), y_test, est.predict(X_test))
    _check_behavior_2d_for_constant(est)
def test_constant_strategy_multioutput_regressor():
    """Constant strategy with one constant per output column."""
    random_state = np.random.RandomState(seed=1)
    X_learn = random_state.randn(10, 10)
    y_learn = random_state.randn(10, 5)

    # test with 2d array
    constants = random_state.randn(5)

    X_test = random_state.randn(20, 10)
    y_test = random_state.randn(20, 5)

    # Correctness oracle
    estimator = DummyRegressor(strategy="constant", constant=constants)
    estimator.fit(X_learn, y_learn)
    predictions_learn = estimator.predict(X_learn)
    predictions_test = estimator.predict(X_test)
    _check_equality_regressor(constants, y_learn, predictions_learn,
                              y_test, predictions_test)
    _check_behavior_2d_for_constant(estimator)
def test_quantile_strategy_regressor():
    """Quantiles 0, 0.3, 0.5 and 1 map to min, percentile, median and max."""
    random_state = np.random.RandomState(seed=1)
    X = [[0]] * 5  # ignored
    y = random_state.randn(5)

    median_reg = DummyRegressor(strategy="quantile", quantile=0.5)
    median_reg.fit(X, y)
    assert_array_equal(median_reg.predict(X), [np.median(y)] * len(X))

    min_reg = DummyRegressor(strategy="quantile", quantile=0)
    min_reg.fit(X, y)
    assert_array_equal(min_reg.predict(X), [np.min(y)] * len(X))

    max_reg = DummyRegressor(strategy="quantile", quantile=1)
    max_reg.fit(X, y)
    assert_array_equal(max_reg.predict(X), [np.max(y)] * len(X))

    pct_reg = DummyRegressor(strategy="quantile", quantile=0.3)
    pct_reg.fit(X, y)
    assert_array_equal(pct_reg.predict(X), [np.percentile(y, q=30)] * len(X))
class Mean:
    """Finds the mean of all the days in the history and uses them as the
    prediction, snapped to the nearest multiple of ``batch_size``."""

    def __init__(self, batch_size=1):
        self.batch_size = batch_size
        self.regressor_ = DummyRegressor(strategy="mean")

    def fit(self, X, y=None):
        """Fit the underlying mean regressor.

        Returns self, following the scikit-learn convention (backward-
        compatible: the original returned None, which callers ignore).
        """
        self.regressor_.fit(X, y)
        return self

    def predict(self, X):
        """Predict the batch-rounded training mean, shaped (n_samples, 1)."""
        predictions = self.regressor_.predict(X)
        # Snap each prediction to the nearest batch_size multiple.
        predictions = np.round(predictions / self.batch_size) * self.batch_size
        return np.expand_dims(predictions, -1)
def get_mean_reg(trait_name, metric_name, x, y):
    """Fit and evaluate a mean-predicting baseline on a 70/30 split."""
    from sklearn.dummy import DummyRegressor

    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=0)

    # Create baseline regressor (always predicts mean)
    baseline = DummyRegressor(strategy='mean')
    baseline.fit(x_train, y_train)
    evaluate(y_test, baseline.predict(x_test), "Baseline Regressor")
    return baseline
def naive_model_compare_r2(X_tr, y_tr, X_te, y_te, y_pr):
    """Print R^2 for the model next to random-noise and dummy baselines."""
    # Model
    print('--- model: {:.3}'.format(metrics.r2_score(y_te, y_pr)))
    # normal random distribution
    y_pr_rand = np.random.normal(0, 1, y_pr.shape)
    print('--- normal random distribution: {:.3}'\
        .format(metrics.r2_score(y_te, y_pr_rand)))
    # dummy regressors
    for s in ['mean', 'median']:
        baseline_pred = DummyRegressor(strategy=s).fit(X_tr, y_tr).predict(X_te)
        print('--- dummy regressor ('+ s +') : r2_score={:.3}'\
            .format(metrics.r2_score(y_te, baseline_pred)))
class SVMTotal:
    """SVR / mean / median / per-raisha-baseline models behind one interface.

    NOTE(review): `utils`, `logging`, `pd`, and the 'raisha'/'sample_id'
    column names come from the surrounding module — inferred from usage only.
    """

    def __init__(self, features, model_name, kernel: str=None, degree: int=None):
        # Choose the underlying model from the (case-insensitive) model name.
        if 'svm' in str.lower(model_name):
            self.model = SVR(gamma='scale', kernel=kernel, degree=degree)
        elif 'average' in str.lower(model_name):
            self.model = DummyRegressor(strategy='mean')
        elif 'median' in str.lower(model_name):
            self.model = DummyRegressor(strategy='median')
        elif 'per_raisha_baseline' in str.lower(model_name):
            # Lookup table (mean label per raisha) built lazily in fit().
            self.per_raisha = None
        else:
            logging.error('Model name not in: svm, average, median')
            print('Model name not in: svm, average, median')
            raise Exception('Model name not in: svm, average, median')
        self.features = features
        self.model_name = model_name

    def fit(self, train_x: pd.DataFrame, train_y: pd.Series):
        """Fit the chosen model, or build the per-raisha mean-label table."""
        if 'per_raisha_baseline' in str.lower(self.model_name):
            train_y.name = 'labels'
            train_x = train_x.merge(train_y, right_index=True, left_index=True)
            self.per_raisha = pd.DataFrame(train_x.groupby(by='raisha').labels.mean())
            self.per_raisha.columns = ['predictions']
        else:
            # Restrict to the configured feature columns before fitting.
            train_x = train_x[self.features]
            self.model = self.model.fit(train_x, train_y)

    def predict(self, validation_x: pd.DataFrame, validation_y: pd.Series):
        """Predict and attach binned versions of predictions and labels."""
        if 'per_raisha_baseline' in str.lower(self.model_name):
            # Look up the per-raisha mean computed in fit().
            validation_x = validation_x.merge(self.per_raisha, left_on='raisha', right_index=True)
            validation_x.index = validation_x.sample_id
            predictions = validation_x.predictions
        else:
            validation_x = validation_x[self.features]
            predictions = self.model.predict(validation_x)
        validation_y.name = 'labels'
        predictions = pd.Series(predictions, index=validation_y.index, name='predictions')
        if predictions.dtype == float:  # regression- create bins to measure the F-score
            bin_prediction, bin_test_y = utils.create_bin_columns(predictions, validation_y)
            four_bin_prediction, four_bin_test_y = utils.create_4_bin_columns(predictions, validation_y)
        else:
            # Non-float predictions: attach empty bin columns instead.
            bin_prediction, bin_test_y = pd.Series(name='bin_prediction'), pd.Series(name='bin_label')
            four_bin_prediction, four_bin_test_y =\
                pd.Series(name='four_bin_prediction'), pd.Series(name='four_bin_label')
        predictions = pd.DataFrame(predictions).join(validation_y).join(bin_test_y).join(bin_prediction)
        predictions = predictions.join(four_bin_test_y).join(four_bin_prediction)
        return predictions
def run_dummy_regressor(train_embeds, train_targets, test_embeds, test_targets,
                        scaler=None):
    """Print RMSE / MAPE / R^2 for a median-predicting baseline.

    If `scaler` is given, the RMSE is mapped back to the original target
    scale with `scaler.inverse_transform`.
    """
    dummy = DummyRegressor(strategy="median")
    dummy.fit(train_embeds, train_targets)

    # Predict once and reuse for all three metrics (the original called
    # predict() three times and duplicated the RMSE line in both branches).
    preds = dummy.predict(test_embeds)
    rmse = mean_squared_error(test_targets, preds, squared=False)
    if scaler is not None:  # fix: `is None` identity check, not `== None`
        rmse = scaler.inverse_transform(np.array(rmse).reshape(1, -1))[0][0]
    print('Dummy regressor RMSE:', rmse)
    print(
        'Dummy regressor MAPE:',
        mean_absolute_percentage_error(test_targets, preds))
    print('Dummy regressor R2:', r2_score(test_targets, preds))
def dummy_regression(x_train, y_train, x_test, y_test, strategy):
    """ Regression strategies :
    case 1 : y_pred_random = np.random.randint(np.min(y), np.max(y), y_test.shape)
    case 2 : strategy is 'mean' or 'median'
    cf :
    - https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html
    """
    # BUG FIX: pass `strategy` by keyword — the DummyRegressor constructor's
    # parameters are keyword-only in recent scikit-learn, so the original
    # positional call DummyRegressor(strategy) raises a TypeError.
    dum = DummyRegressor(strategy=strategy)
    dum.fit(x_train, y_train)
    y_predicted_dum = dum.predict(x_test)
    # np.sqrt(metrics.mean_squared_error(y_test, y_predicted_dum))  # RMSE
    return y_predicted_dum, y_test
class r07546035_DummyRegression(regression):
    """DummyRegressor wrapper following the project's `regression` interface."""

    def trainAlgo(self):
        # Strategy/quantile come from the experiment's parameter dict.
        self.model = DummyRegressor(
            strategy=self.param['strategy'],
            quantile=self.param['quantile'])
        self.model.fit(self.inputData['X'], self.outputData['y'],
                       sample_weight=None)

    def predictAlgo(self):
        self.result['y'] = self.model.predict(self.inputData['X'])

    def get_paramsAlgo(self):
        # BUG FIX: get_params() takes no positional argument on a bound
        # method; the original get_params(self) passed this wrapper as the
        # sklearn `deep` parameter.
        self.model.get_params()
def dummy_train_test(self, strategy='mean'):
    '''
    “mean”: always predicts the mean of the training set
    “median”: always predicts the median of the training set
    “quantile”: always predicts a specified quantile of the training set, provided with the quantile parameter.
    “constant”: always predicts a constant value that is provided by the user.
    '''
    baseline = DummyRegressor(strategy=strategy)
    baseline.fit(self.X_train, self.y_train)
    predictions = baseline.predict(self.X_test)

    # Report both error metrics rounded to 4 decimals.
    return {
        'mse': round(mean_squared_error(y_pred=predictions, y_true=self.y_test), 4),
        'mae': round(mean_absolute_error(y_pred=predictions, y_true=self.y_test), 4),
    }
def _minimize_simbo_general(fun,
                            x0,  # only used to get number of features
                            args=(),
                            callback=None,
                            batch_size=100,
                            population_size=10000,
                            maxiter=10000,
                            scorer=None,  # if no scorer given, scores are constant
                            selector=None,  # only relevant if sampler is given
                            sampler=None):
    """Surrogate-model-guided batch optimization loop.

    Each iteration samples a candidate population, ranks it with `scorer`,
    evaluates the most promising `batch_size` candidates with `fun`, then
    refits the scorer and sampler on the new evidence.
    """
    n_iter = int(maxiter / batch_size)
    assert n_iter > 0
    dummy_generator = generative_models.DummyGenerator(len(x0))
    if scorer is None:
        # Constant predictions make the ranking a no-op (i.e. random search).
        scorer = DummyRegressor()
    if sampler is None:
        sampler = dummy_generator
    if isinstance(selector, float) and 0 < selector < 1:
        # A float selector is interpreted as a keep-fraction/percentile.
        selector = percentile_selector(selector)
    for i in range(n_iter):
        if i == 0:
            # No evidence yet: draw the first batch blindly.
            batch = dummy_generator.sample(batch_size)
        else:
            population = sampler.sample(population_size)
            scores = scorer.predict(population)
            # Keep the batch_size candidates with the LOWEST predicted score.
            batch_w_score = heapq.nsmallest(batch_size, zip(scores, population),
                                            key=lambda x: x[0])
            batch = [v for score, v in batch_w_score]
        results = optimize_utils.score_multi(fun, batch, args, callback)
        selected = selector(results, batch) if selector is not None else batch
        scorer.fit(batch, results)
        sampler.fit(selected)
    # NOTE(review): the best point is taken over the LAST batch only, not all
    # evaluated batches — confirm this is intended.
    best_fval, best_x = max(zip(results, batch), key=lambda x: x[0])
    nfev = batch_size * n_iter
    return optimize_utils.to_result(x=best_x, fun=best_fval, niter=n_iter,
                                    nfev=nfev)
def testLinearRegression(x, y):
    """Compare LinearRegression MSE under several evaluation schemes."""
    # Mean for Linear Regression model using all of the data for training.
    full_fit = LinearRegression().fit(x, y)
    lgError = mean_squared_error(y, full_fit.predict(x))

    # Average mean when using k-folds for training and testing.
    kFoldError = kFoldLinearRegression(x, y)

    # Mean when using dummy model with all of the data used for training.
    dummy = DummyRegressor(strategy="mean").fit(X=x, y=y)
    dummyError = mean_squared_error(y, dummy.predict(x))

    # Mean when using an 80:20 train:test split for Linear Regression.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    split_fit = LinearRegression().fit(X_train, y_train)
    splitError = mean_squared_error(y_test, split_fit.predict(X_test))

    print("LG k-fold mean squared error: %f, baseline square error: %f, LG not-folded error: %f, LG 80:20 error: %f"
          %(kFoldError, dummyError, lgError, splitError))
def do_polynomial_reg(type_id, model_class, degree, alpha=None): matrix = access.item_matrix(type_id) # item id of abyssal magstab data = np.array(matrix) x = data[:, 1:] y = data[:, 0] kf = KFold(n_splits=5) mean_error = [] dummy_mses = [] Xpoly = PolynomialFeatures(degree=int(degree)).fit_transform(x) if alpha is not None: model = model_class(normalize=True, alpha=alpha) else: model = model_class(normalize=True) temp = [] for train, test in kf.split(Xpoly): model.fit(Xpoly[train], y[train]) ypred = model.predict(Xpoly[test]) temp.append(mean_squared_error(y[test], ypred)) dummy_model = DummyRegressor(strategy='mean') dummy_model.fit(Xpoly[train], y[train]) dummy_pred = dummy_model.predict(Xpoly[test]) mean_error.append(mean_squared_error(y[test], ypred)) dummy_mses.append(mean_squared_error(y[test], dummy_pred)) mse = np.array(mean_error).mean() dummy_mse = np.array(dummy_mses).mean() print( f'{model_class.__name__} w/ Polynomial Features Mean Squared Error: {mse}' ) print(f'Dummy Classifier (Mean) Mean Squared Error: {dummy_mse}') print(f'Percentage Difference: {((mse - dummy_mse)/mse) * 100}%')
def main():
    """End-to-end Yelp review-quality regression: parse reviews, build
    TF-IDF features, and compare several regressors against a
    mean-predicting dummy baseline."""
    # read review data
    print('parsing review data...')
    reviews = parse_json('./yelp_dataset_challenge_academic_dataset/yelp_academic_dataset_review.json')

    # use only reviews posted after 2008
    valid_reviews = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        if review_date.year < 2008:
            continue
        valid_reviews.append(review)
    reviews = valid_reviews

    # sample the data
    # sample_num = len(reviews)
    # print('sampling...', sample_num, 'out of', len(reviews))
    # reviews = sample(reviews, sample_num)

    # tokenize text for all reviews
    print('tokenizing text for all reviews...')
    texts = [review['text'] for review in reviews]
    count_vect = CountVectorizer(max_features = 100)
    X = count_vect.fit_transform(texts)

    # transform from occurrence to frequency
    print('converting occurrence to frequency...')
    tfidf_transformer = TfidfTransformer()
    X = tfidf_transformer.fit_transform(X)

    # load the linear model for normalization
    clf = joblib.load('./normalization/linear_model_for_normalization.pkl')

    # get labels
    print('calculating labels...')
    y = []
    for review in reviews:
        review_date = datetime.datetime.strptime(review['date'], '%Y-%m-%d')
        # normalize: total votes divided by the year-adjusted expected count
        normalizor = clf.predict(np.array([[review_date.year]]))[0][0]
        review_quality = sum(review['votes'].values()) / normalizor
        y.append(review_quality)

    # splitting into train and test set (60/40, order preserved)
    print('splitting into train and test set...')
    train_len = int(X.shape[0] * 0.6)
    X_train = X[:train_len, :]
    y_train = y[:train_len]
    X_test = X[train_len:, :]
    y_test = y[train_len:]
    print('train size:', X_train.shape)
    print('test size:', X_test.shape)

    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train.toarray())
    # X_test = poly.fit_transform(X_test.toarray())
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression:')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Ridge
    print('\nRidge: ')
    model = Ridge()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # passive aggresive
    print('\nPoly: ')
    model = PassiveAggressiveRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
# --- ENEM score regression: predict the math score from the other scores ---
# NOTE(review): relies on `data_no0` and `tests` defined earlier in the file.
grades_in = ['NU_NOTA_CH','NU_NOTA_LC', 'NU_NOTA_CN','NU_NOTA_REDACAO']
grade_out = 'NU_NOTA_MT'
data_noNA = data_no0[tests].dropna()
x = data_noNA[grades_in]
y = data_noNA[grade_out]
seed = 4321
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25,
                                                    random_state=seed)
# Linear SVR
model_linear = LinearSVR(random_state=seed)
model_linear.fit(x_train, y_train)
predictions_linear = model_linear.predict(x_test)
# Kernel SVR
model_svr = SVR()
model_svr.fit(x_train, y_train)
predictions_svr = model_svr.predict(x_test)
# Mean-predicting baseline
model_dummy = DummyRegressor()
model_dummy.fit(x_train, y_train)
predictions_dummy = model_dummy.predict(x_test)
# RMSE for each model
linear = mean_squared_error(y_test, predictions_linear)**(1/2)
svr = mean_squared_error(y_test, predictions_svr)**(1/2)
dummy = mean_squared_error(y_test, predictions_dummy)**(1/2)
print(f'-=- Mean error of models -=-\nlinear: {linear}, svr: {svr}, dummy: {dummy}')
# R^2 for each model (the same variables are reused for the new metric)
linear = r2_score(y_test, predictions_linear)
svr = r2_score(y_test, predictions_svr)
dummy = r2_score(y_test, predictions_dummy)
print(f'-=- R2 of models -=-\nlinear: {linear}, svr: {svr}, dummy: {dummy}')
# --- PLS regression fMRI -> MEG, compared against a mean baseline ---
# NOTE(review): `ncomp`, `cv`, `X_fmri`, `X_meg`, `nfolds`, `max_comps`,
# `comp_scores`, `dumb_scores`, `seed`, `band`, and `home` are defined
# elsewhere in the surrounding script.
pls = PLSRegression(n_components=ncomp)
dumb = DummyRegressor(strategy='mean')
mae = 0
dumb_mae = 0
for oidx, (train, test) in enumerate(cv):
    X_fmri_train = X_fmri[train]
    X_fmri_test = X_fmri[test]
    X_meg_train = X_meg[train]
    X_meg_test = X_meg[test]
    # PLS fit/predict on this fold; accumulate MAE across folds.
    pls.fit(X_fmri_train, X_meg_train)
    pred = pls.predict(X_fmri_test)
    mae += mean_absolute_error(X_meg_test, pred)
    # Baseline fit/predict on the same fold.
    dumb.fit(X_fmri_train, X_meg_train)
    dumb_pred = dumb.predict(X_fmri_test)
    dumb_mae += mean_absolute_error(X_meg_test, dumb_pred)
# Average fold MAE for this component count.
comp_scores.append(mae/nfolds)
dumb_scores.append(dumb_mae/nfolds)
import matplotlib
matplotlib.use('Agg')  # headless backend so savefig works without a display
import matplotlib.pyplot as plt
plt.plot(max_comps, comp_scores, max_comps, dumb_scores)
t_str = seed + str(band)
plt.title(t_str)
plt.savefig(home+'/tmp/meg_fmri_%s_%s.png'%(seed,band[0]))
# --- Joint fMRI+MEG PLS evaluation for one CV fold ---
# NOTE(review): this fragment appears to be the body of an outer CV loop:
# `train`, `test`, `pls`, `dumb`, `mae`, `dumb_mae`, `fmri_mae`, `meg_mae`,
# `within`, `nfolds`, and the *_scores lists are defined in code not shown.
X_fmri_test = X_fmri[test]
X_meg_train = X_meg[train]
X_meg_test = X_meg[test]
y_train = y[train]
y_test = y[test]
# Concatenate the two modalities feature-wise.
X_train = np.hstack([X_fmri_train, X_meg_train])
X_test = np.hstack([X_fmri_test, X_meg_test])
pls.fit(X_train, y_train)
pred = pls.predict(X_test)
mae += mean_absolute_error(y_test, pred)
# Mean baseline on the same concatenated features.
dumb.fit(X_train, y_train)
dumb_pred = dumb.predict(X_test)
dumb_mae += mean_absolute_error(y_test, dumb_pred)
if within:
    # Single-modality fits for comparison.
    pls.fit(X_fmri_train, y_train)
    pred = pls.predict(X_fmri_test)
    fmri_mae += mean_absolute_error(y_test, pred)
    pls.fit(X_meg_train, y_train)
    pred = pls.predict(X_meg_test)
    meg_mae += mean_absolute_error(y_test, pred)
comp_scores.append(mae/nfolds)
dumb_scores.append(dumb_mae/nfolds)
fmri_scores.append(fmri_mae/nfolds)
meg_scores.append(meg_mae/nfolds)
def main():
    """Load pre-split CSV data, scale features to [0, 1], and benchmark
    several regressors against a mean-predicting dummy baseline."""
    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # The result turns out to be worse using non-linear polynomial regression
    # convert to polynomial features
    # print('converting to polynomial features...')
    # poly = PolynomialFeatures(2)
    # X_train = poly.fit_transform(X_train)
    # X_test = poly.fit_transform(X_test)
    # print('train set: ', X_train.shape)
    # print('test set: ', X_test.shape)

    # scale the attributes to [0, 1]
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model)
    print('\nDummy Regression: (baseline)')
    model = DummyRegressor(strategy='mean')
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Linear Regression
    print('\nLinear_regression: ')
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # KNN Regression
    # print('\nKNN Regression: ')
    # model = KNeighborsRegressor()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # Neural Network - Bernoulli Restricted Boltzmann Machine (RBM)
    # print('\nNeural Network - RBM: ')
    # model = BernoulliRBM()
    # model.fit(X_train, y_train)
    # y_pre = model.predict(X_test)
    # print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    # print('r2_score: ', r2_score(y_test, y_pre))

    # AdaBoost
    print('\nAdaBoost: ')
    model = AdaBoostRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))

    # Random Forest
    print('\nRandom Forest:')
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))
import csv
import pickle
from sklearn.dummy import DummyRegressor
import numpy as np

# Normalize age into [0, 1] and encode gender on a 0..1 scale.
age_range = 80
gender = {'male': 0, 'other': 0.5, 'female': 1}

# Two (age, gender) feature rows with matching column-vector targets.
X = np.array([[20 / age_range, gender['male']],
              [56 / age_range, gender['other']]])
Y = np.array([[.2], [.7]])

clf = DummyRegressor()
clf.fit(X, Y)

# print([r[2] for r in data])
print(Y)
# print([
#     movies[int(round(idx * len(movies)))]
#     for idx in clf.predict(X)
# ])
print(clf.predict([[0.2, 1]]))

# Persist the fitted baseline for later use.
with open('model.pk', 'wb') as outfile:
    pickle.dump(clf, outfile)
# --- Game-total models: Vegas line vs dummy vs OLS vs random forest ---
# NOTE(review): `test_season`, `train_season`, `X_train_s`, `y_train_s`,
# `sm` (statsmodels) and RandomForestRegressor are defined elsewhere.
X_test_s = test_season.drop('GAME_TOTAL', axis = 1).to_numpy()
y_test_s = test_season['GAME_TOTAL'].to_numpy()
Test_Vegas = test_season['TOTAL_CLOSE'].to_numpy()
Train_Vegas = train_season['TOTAL_CLOSE'].to_numpy()
#Vegas BASELINE = 17.650007402704748
# RMSE of the Vegas closing line over all games (result not captured).
mean_squared_error(np.append(y_train_s,y_test_s), np.append(Train_Vegas,Test_Vegas), squared = False)
#DUMMY REGRESSOR:
dummy_regr = DummyRegressor(strategy="mean")
dummy_regr.fit(X_train_s, y_train_s)
#-0.7833193001644205
dummy_regr.score(X_test_s, y_test_s)
#27.845427872989156
mean_squared_error(y_test_s, dummy_regr.predict(X_test_s), squared = False)
#OLS
regressor = sm.OLS(y_train_s, X_train_s)
regressor = regressor.fit()
#evidently this returned a 0.991 R**2
#second run gave us 0.993
regressor.summary()
preds = regressor.predict(X_test_s)
#18.5802074596655
mean_squared_error(y_test_s, preds, squared = False)
#RANDOM FOREST
rf = RandomForestRegressor(oob_score=True)
rf.fit(X_train_s,y_train_s)
#0.23057109964613554
from pathlib import Path
sys.path.append('/home/jiajunb/prosocial-conversations')
from models import XGBOOST_FEATURES, EIGENMETRICS

# Mean-predicting baseline over the xgboost feature set; train, report
# train-set metrics, persist, and predict on the held-out test set.
ROOT_DIR = Path('/shared/0/projects/prosocial/data/finalized/')
train_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/train.tsv',
                       sep='\t', usecols=XGBOOST_FEATURES + EIGENMETRICS)
train_X = train_df[XGBOOST_FEATURES].values
train_y = train_df[EIGENMETRICS].values.reshape(-1)
dummy_clf = DummyRegressor(strategy="mean")
dummy_clf.fit(train_X, train_y)
# on training set
train_preds = dummy_clf.predict(train_X)
print(f'R^2 on training set: {r2_score(train_y, train_preds)}')
print(f'MSELoss on training set: {mean_squared_error(train_preds, train_y)}')
output_path = ROOT_DIR / 'model_checkpoints/dummy'
output_path.mkdir(exist_ok=True, parents=True)
joblib.dump(dummy_clf, output_path / 'dummy.model.buffer')
test_df = pd.read_csv(ROOT_DIR / 'data_cache/lr_or_xgboost/test.tsv',
                      sep='\t', usecols=XGBOOST_FEATURES + EIGENMETRICS)
test_X = test_df[XGBOOST_FEATURES].values
# NOTE(review): unlike train_y, test_y is NOT reshaped to 1-D — confirm.
test_y = test_df[EIGENMETRICS].values
# on test set
test_preds = dummy_clf.predict(test_X)
O MSE, sigla em inglês para essa métrica, é uma medida em que, quanto mais perto de zero, melhor. Veja o resultado quando calculamos o MSE de dois vetores iguais: """ mean_squared_error(y_teste, y_teste) """Nosso resultado é zero! Você deve estar se perguntando: meu modelo não está nem perto de zero, será que ele é tão ruim assim? Nós ainda não temos como te dar essa resposta, precisamos de um critério comparativo, pois assim conseguimos dizer como nosso modelo está indo. Por exemplo, que tal classificar os nossos dados de uma maneira "bobinha"? Para isso temos os chamados métodos **Dummy**. """ from sklearn.dummy import DummyRegressor modelo_dummy = DummyRegressor() modelo_dummy.fit(x_treino, y_treino) dummy_predicoes = modelo_dummy.predict(x_teste) mean_squared_error(y_teste, dummy_predicoes) """Finalmente conseguimos responder se nosso modelo é tão ruim assim! Na realidade nosso modelo não é um dos melhores, temos muito o que melhorar, mas já somos melhores que uma classificação ingênua. Com isso, encerramos nossa última aula. Espero que vocês tenham gostado! Participem também do nosso **desafio final, valendo um Nintendo Switch**. Bons estudos e boa sorte! Forte abraço! ## Desafio 1 da [Allan Spadini](https://twitter.com/allanspadini)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor

# Single-feature regression on the diabetes data (column 6 only),
# compared against a mean-predicting dummy baseline.
diabetes = datasets.load_diabetes()
X = diabetes.data[:, None, 6]
y = diabetes.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the real model and the baseline on the same training split.
lm = LinearRegression()
lm.fit(X_train, y_train)
lm_dummy_mean = DummyRegressor(strategy='mean')
lm_dummy_mean.fit(X_train, y_train)

y_predict = lm.predict(X_test)
y_predict_dummy_mean = lm_dummy_mean.predict(X_test)

# Report coefficients and both models' MSE / R^2 on the held-out split.
print('Linear model, coefficients: ', lm.coef_)
print("Mean squared error (dummy): {:.2f}".format(
    mean_squared_error(y_test, y_predict_dummy_mean)))
print("Mean squared error (linear model): {:.2f}".format(
    mean_squared_error(y_test, y_predict)))
print("r2_score (dummy): {:.2f}".format(
    r2_score(y_test, y_predict_dummy_mean)))
print("r2_score (linear model): {:.2f}".format(
    r2_score(y_test, y_predict)))

# Plot the test points with both fits overlaid.
plt.scatter(X_test, y_test, color='black')
plt.plot(X_test, y_predict, color='green', linewidth=2)
plt.plot(X_test, y_predict_dummy_mean, color='red',
         linestyle='dashed', linewidth=2, label='dummy')
plt.show()
# add new features to the model feature_cols = ['cool', 'useful', 'funny', 'length', 'love', 'hate'] X = yelp[feature_cols] train_test_rmse(X, y) # TASK 8 (BONUS): compare your best RMSE with RMSE for the null model # split the data (outside of the function) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1) # use scikit-learn's built-in dummy regressor from sklearn.dummy import DummyRegressor dumb = DummyRegressor(strategy='mean') dumb.fit(X_train, y_train) y_dumb = dumb.predict(X_test) print np.sqrt(metrics.mean_squared_error(y_test, y_dumb)) # or, create a NumPy array with the right length, and fill it with the mean of y_train y_null = np.zeros_like(y_test, dtype=float) y_null.fill(y_train.mean()) print np.sqrt(metrics.mean_squared_error(y_test, y_null)) # TASK 9 (BONUS): treat this as a classification problem, try KNN, maximize your accuracy from sklearn.neighbors import KNeighborsClassifier knn = KNeighborsClassifier(n_neighbors=150) knn.fit(X_train, y_train) y_pred_class = knn.predict(X_test) print metrics.accuracy_score(y_test, y_pred_class)
def _report_model(header, model, X_train, y_train, X_test, y_test):
    """Fit *model* on the train split, predict the test split, and print
    its mean absolute error and R^2 (same format as the original inline
    copies this replaces)."""
    print(header)
    model.fit(X_train, y_train)
    y_pre = model.predict(X_test)
    print('mean absolute error: ', mean_absolute_error(y_test, y_pre))
    print('r2_score: ', r2_score(y_test, y_pre))


def main():
    """Train and compare several regressors against a mean baseline.

    Loads the train/test CSVs via ``parse``, min-max scales the features,
    then fits and evaluates each model (MAE and R^2 on the test set).
    """
    # load training and testing data set
    print('parsing training set...')
    X_train, y_train = parse('./data_set/train_set.csv')
    print('parsing testing set...')
    X_test, y_test = parse('./data_set/test_set.csv')
    print('train set: ', X_train.shape)
    print('test set: ', X_test.shape)

    # NOTE: polynomial feature expansion (PolynomialFeatures(2)) was tried
    # and gave worse results, so it stays disabled.

    # scale the attributes to [0, 1]; the scaler is fit on train only so
    # no test-set statistics leak into training
    print('standardizing the features...')
    min_max_scaler = MinMaxScaler()
    X_train = min_max_scaler.fit_transform(X_train)
    X_test = min_max_scaler.transform(X_test)

    # training classifiers
    print('training, predicting and evaluating...')

    # Dummy Regression (baseline model), then the real models.
    _report_model('\nDummy Regression: (baseline)',
                  DummyRegressor(strategy='mean'),
                  X_train, y_train, X_test, y_test)
    _report_model('\nLinear_regression: ', LinearRegression(),
                  X_train, y_train, X_test, y_test)
    # NOTE: KNeighborsRegressor and BernoulliRBM experiments were disabled
    # in the original and are intentionally left out of the comparison.
    _report_model('\nAdaBoost: ', AdaBoostRegressor(),
                  X_train, y_train, X_test, y_test)
    _report_model('\nRandom Forest:', RandomForestRegressor(),
                  X_train, y_train, X_test, y_test)
# Fit a linear model, measure train/test MSE, repeat for a mean-predicting
# dummy baseline, and collect all four scores for a comparison chart.
model = LinearRegression()
model.fit(X_train, y_train)

# error on the data the model was fit on
pred_train = model.predict(X_train)
train_score = mean_squared_error(y_train, pred_train)
print('Train error', train_score)

# error on the held-out test split
pred_test = model.predict(X_test)
test_score = mean_squared_error(y_test, pred_test)
print('Test error', test_score)

# baseline: a default DummyRegressor predicts the training-set mean
dummy = DummyRegressor().fit(X_train, y_train)

pred_train_dummy = dummy.predict(X_train)
dummy_train_score = mean_squared_error(y_train, pred_train_dummy)
print('Dummy train error', dummy_train_score)

pred_test_dummy = dummy.predict(X_test)
dummy_test_score = mean_squared_error(y_test, pred_test_dummy)
print('Dummy test error', dummy_test_score)

# assemble the scores into a dataframe indexed by model name
report = {
    'train': [dummy_train_score, train_score],
    'test': [dummy_test_score, test_score],
    'model': ['dummy', 'regression'],
}
report_df = pd.DataFrame(report)
report_df = report_df.set_index(report_df['model'])
# In[35]: X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # # Baseline Model # In[36]: from sklearn.dummy import DummyRegressor dummy_regr = DummyRegressor(strategy="mean") dummy_regr.fit(X_train, y_train) dummy_regr.predict(X_train) baseline = dummy_regr.score(X_train, y_train) print("Baseline R^2: %f" % baseline) # # Multiple Linear Regression # In[37]: ols = linear_model.LinearRegression() ols.fit(X_train, y_train) print("Coefficients: %s" % ols.coef_) print("Intercept: %f" % ols.intercept_) y_test_prediction = ols.predict(X_test) ols.score(X_train, y_train) # In[40]:
def DummyPrediction(X_train, y_train, X_test, y_test):
    """Fit a default DummyRegressor on the training split and return its
    predictions for ``X_test``.

    ``y_test`` is accepted for signature symmetry with sibling helpers but
    is not used.
    """
    baseline = DummyRegressor().fit(X_train, y_train)
    return baseline.predict(X_test)
import pandas as pd
from sklearn.dummy import DummyRegressor

# Read the Canucks roster data.
canucks = pd.read_csv('data/canucks_subbed.csv')

# Feature matrix and salary target.
X = canucks[['No.', 'Age', 'Height', 'Weight', 'Experience']]
y = canucks['Salary']

# Baseline that always predicts the mean salary.
model = DummyRegressor(strategy="mean")
model.fit(X, y)

# Predict the labels of X
model.predict(X)

# Score on the training data, rounded to 2 decimals.
# (Called "accuracy" in the exercise, but for a regressor score() is R^2.)
accuracy = round(model.score(X, y), 2)
accuracy
def test_regressor_scatter():
    """Smoke-test regressor_scatter with a dummy fit on the Boston data."""
    # NOTE(review): load_boston was deprecated and later removed from
    # scikit-learn (1.2+) — confirm the pinned sklearn version supports it.
    features, target = load_boston(return_X_y=True)
    model = DummyRegressor()
    model.fit(features, target)
    predictions = model.predict(features)
    regressor_scatter(features, target, predictions, 'regressor_scatter.pdf')