def validate(nPrev, nAfter, aux_temp, aux_sun, aux_prec, get_model=False):
    X_Final = getFeature(nPrev, nAfter, aux_temp, aux_sun, aux_prec, TrainFiles)
    data_train_target = pd.read_csv(TrainTarget, sep='\t', header=None)
    y = data_train_target.loc[:, 0].values
    TEST_SIZE = 0.2
    RANDOM_STATE = 0
    X_train, X_val, y_train, y_val = train_test_split(
        X_Final, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
    # Fit the imputer on the training split only and reuse it for validation
    # (the original refit it on X_val, which leaks validation statistics).
    imp.fit(X_train)
    X_train = imp.transform(X_train)
    X_val = imp.transform(X_val)
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    y_val_pred = reg.predict(X_val)
    print(mean_squared_error(y_val, y_val_pred))
    if get_model:
        imp.fit(X_Final)
        X_Final = imp.transform(X_Final)
        reg_submit = RidgeCV()
        reg_submit.fit(X_Final, y)
        return reg_submit
    return mean_squared_error(y_val, y_val_pred)
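For reference, a leakage-free imputation step fits the imputer on the training split only and reuses the fitted statistics everywhere else. A minimal self-contained sketch; SimpleImputer with a mean strategy is an assumption, since the snippet's imp is defined elsewhere.

import numpy as np
from sklearn.impute import SimpleImputer

X_train = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]])
X_val = np.array([[np.nan, 4.0]])

imp = SimpleImputer(strategy='mean')   # assumed strategy
X_train = imp.fit_transform(X_train)   # fit on the training split only
X_val = imp.transform(X_val)           # reuse the fitted statistics: no leakage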
def ridge_predict(train_data, train_target, test_data):
    # Prep modeller
    alpha_ranges = [
        1e-3, 1e-2, 1e-1, 1, 1e2, 1e3, 2e3, 2.5e3, 3e3, 3.5e3, 4e3, 5e3,
        6e3, 6.1e3, 6.15e3, 6.25e3, 6.3e3, 6.4e3, 7e3, 7.75e3, 7.9e3, 8e3,
        8.1e3, 8.2e3, 8.25e3, 8.3e3, 8.4e3, 8.5e3, 8.75e3, 9e3, 9.25e3,
        9.4e3, 9.5e3, 9.6e3, 9.75e3, 1e4, 1.25e4, 1.4e4, 1.5e4, 1.55e4,
        1.58e4, 1.6e4, 1.625e4, 1.65e4, 1.7e4, 1.725e4, 1.74e4, 1.75e4,
        1.76e4, 1.78e4, 1.85e4, 2e4, 2.25e4, 2.5e4, 3e4, 4e4, 0.5e5,
        0.75e5, 1e5, 1.25e5, 1.5e5, 0.8e6, 0.9e6, 1e6, 1.1e6, 1.2e6,
        1.25e6, 1.28e6, 1.3e6, 1.32e6, 1.33e6, 1.34e6, 1.4e6, 1.5e6, 2e6,
        1e7, 1e8, 1e9, 5e9, 1e10, 5e10, 1e11, 1e12, 1e13]
    clf = RidgeCV(alphas=alpha_ranges, normalize=True, cv=None,
                  fit_intercept=False, store_cv_values=True)

    # Fit
    clf.fit(train_data, train_target)

    # print("alpha range:", alpha_ranges)
    # print("CV per alpha:", np.mean(clf.cv_values_, axis=0))
    # print("alpha used:", clf.alpha_)
    # print("fit score:", clf.score(train_data, train_target))

    # Prediction
    predictions = clf.predict(test_data)
    return predictions
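The grid above was evidently tuned by hand around specific regions; when that level of detail is not needed, a comparable log-spaced grid can be generated in one line (a sketch; the number of points is arbitrary):

import numpy as np
# 81 log-spaced candidates covering the same 1e-3 .. 1e13 span as above
alpha_ranges = np.logspace(-3, 13, num=81)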
def regularizedreg(Xtrain, Xtest, ytrain, ytest):
    Rclf = RidgeCV(alphas=[1, 2, 20, 40, 50])
    # RidgeCV(alphas=[0.1, 1.0, 2.0, 4.0, 20.0], cv=None, fit_intercept=True,
    #         scoring=None, normalize=False)
    Rclf.fit(Xtrain, ytrain)
    print("Residual sum of squares: %.2f"
          % np.mean((Rclf.predict(Xtest) - ytest) ** 2))
    print('Regularization chosen, alpha = %.2f' % Rclf.alpha_)
    print(' Coef values = ', Rclf.coef_)
    print('Variance score: %.2f' % Rclf.score(Xtest, ytest))
def fit_Ridge(features_train, labels_train, features_pred, alphas=(0.1, 1.0, 10.0)):
    model = RidgeCV(normalize=True, store_cv_values=True, alphas=alphas)
    model.fit(features_train, labels_train)
    cv_errors = np.mean(model.cv_values_, axis=0)
    print("RIDGE - CV error min: ", np.min(cv_errors))
    # Test the model
    labels_pred = model.predict(features_pred)
    return labels_pred
def create_firststage_preds(train, valid, testing):
    """
    This handles the first stage of a true stacking procedure using random
    forests to create first stage predictions in the train, test, and
    validation. Splits train into two sections, runs a random forest on each
    half and predicts from one half into the other (and vice versa). Then a
    random forest is run on the whole model and predicted into both
    validation and test.
    """
    np.random.seed(42)
    # Get vector of de-dupped values of ids
    id_dat = pd.DataFrame(train["tube_assembly_id"].drop_duplicates())
    # Create random vector to split train val on
    vect_len = len(id_dat.iloc[:, 0])
    id_dat["rand_vals"] = np.array(np.random.rand(vect_len, 1))
    df = pd.merge(train, id_dat, on="tube_assembly_id")

    # Create model for both halves of df
    frst1 = RandomForestRegressor(n_estimators=300, n_jobs=7)
    is_first_half = df.rand_vals > 0.5
    is_scnd_half = df.rand_vals < 0.5
    frst1.fit(df.loc[is_first_half, feats], df.loc[is_first_half, "target"])
    frst2 = RandomForestRegressor(n_estimators=300, n_jobs=7)
    frst2.fit(df.loc[is_scnd_half, feats], df.loc[is_scnd_half, "target"])

    # Predict frst1 onto frst2's half and vice versa
    # (.loc assignment avoids the chained-assignment warning of the original)
    train["forest"] = 0
    train.loc[is_scnd_half, "forest"] = frst1.predict(df.loc[is_scnd_half, feats])
    train.loc[is_first_half, "forest"] = frst2.predict(df.loc[is_first_half, feats])

    # Create forest in full data for validation and test
    frst = RandomForestRegressor(n_estimators=300, n_jobs=7)
    frst.fit(df[feats], df.target)
    valid["forest"] = frst.predict(valid[feats])
    testing["forest"] = frst.predict(testing[feats])

    # Create model for both halves of df
    rdg1 = RidgeCV(alphas=[0.5, 0.75, 1, 1.25])
    rdg2 = RidgeCV(alphas=[0.5, 0.75, 1, 1.25])
    rdg1.fit(df.loc[is_first_half, feats], df.loc[is_first_half, "target"])
    rdg2.fit(df.loc[is_scnd_half, feats], df.loc[is_scnd_half, "target"])

    # Predict rdg1 onto rdg2's half and vice versa
    train["ridge"] = 0
    train.loc[is_scnd_half, "ridge"] = rdg1.predict(df.loc[is_scnd_half, feats])
    train.loc[is_first_half, "ridge"] = rdg2.predict(df.loc[is_first_half, feats])

    # Create ridge in full data for validation and test
    rdg = RidgeCV(alphas=[0.5, 0.75, 1, 1.25])
    rdg.fit(df[feats], df.target)
    valid["ridge"] = rdg.predict(valid[feats])
    testing["ridge"] = rdg.predict(testing[feats])
def ensemble(Method, alphas, blend_train, blend_test, Y_dev, Y_test, n_folds):
    if Method == 1:
        bclf = RidgeCV(alphas=alphas, normalize=True, cv=n_folds)
    elif Method == 2:
        bclf = ElasticNetCV(alphas=alphas, normalize=True, cv=n_folds)
    else:
        bclf = LassoCV(alphas=alphas, normalize=True, cv=n_folds)
    # the fit / report / predict steps were identical in all three branches
    bclf.fit(blend_train, Y_dev)
    print("Best alpha = ", bclf.alpha_)
    Y_test_predict = bclf.predict(blend_test)
    score1 = metrics.mean_absolute_error(Y_test, Y_test_predict)
    score = normalized_gini(Y_test, Y_test_predict)
    return score1, score
def orth_signal(x, atol=1e-13, rtol=0):
    """
    Returns a signal orthogonal to the input ensemble.
    x -> input signal [n_samples, n_neurons]
    """
    t = np.linspace(0, 1, x.shape[0])[:, None]
    f = np.arange(x.shape[1]) / x.shape[1]
    xt = np.sum(np.sin(2 * np.pi * f * 3 * t) / (f + 1), axis=1)
    w = RidgeCV(np.logspace(-6, 3, 50))
    w.fit(x, xt)
    xt = xt - w.predict(x)
    # pdb.set_trace()
    return xt
def RidgeCVLinear(train, test):
    print('starting RidgeCVLinear ...')
    ridge = RidgeCV(normalize=True, cv=5)
    # reindex returns a copy, so assign the shuffled frame back
    # (the original discarded the result, leaving train unshuffled)
    train = train.reindex(np.random.permutation(train.index))
    tr_X = train.drop('LogSales', axis=1)
    tr_Y = train['LogSales']
    cutoff = math.floor(0.7 * tr_Y.size)
    ridge.fit(tr_X[:cutoff], tr_Y[:cutoff])
    predY = ridge.predict(tr_X[cutoff:])
    mspe = rmspe(predY, tr_Y[cutoff:])
    print('rmspe is %9f' % mspe)
    print(train.columns)
    print(ridge.coef_)
    print('starting RidgeCVLinear ... completed')
    return ridge
def stacking(estimators):
    # training
    predictions = []
    for estim in estimators:
        estim.fit(X, y)
        predictions.append(estim.predict(X))
    # aggregator
    agg = RidgeCV(alphas=alphas, cv=5, normalize=True, fit_intercept=True)
    agg.fit(np.array(predictions).T, y)

    # test
    predictions = []
    for estim in estimators:
        predictions.append(estim.predict(test_data))
    predictions = agg.predict(np.array(predictions).T)
    write_results(predictions)
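stacking() above depends on module-level X, y, alphas, test_data and write_results. A self-contained sketch of the same train-then-aggregate pattern (the data and base estimators here are illustrative, and normalize=True is omitted because recent scikit-learn removed it):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=200, n_features=10, random_state=0)
test_data, X, y = X[:20], X[20:], y[20:]   # illustrative held-out block

estimators = [DecisionTreeRegressor(random_state=0), KNeighborsRegressor()]
train_preds = np.column_stack([e.fit(X, y).predict(X) for e in estimators])
agg = RidgeCV(alphas=[0.1, 1.0, 10.0], cv=5).fit(train_preds, y)  # aggregator
test_preds = np.column_stack([e.predict(test_data) for e in estimators])
final = agg.predict(test_preds)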
def ridgeRegression(X, Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: report best RMSE value for tuned alpha in ridge regression
    """
    tuningAlpha = [0.1, 0.01, 0.001]

    # can change to model on the entire dataset but by convention splitting
    # the dataset is a better option
    # X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(
    #     X, Y, test_size=0.10, random_state=5)

    ridge = RidgeCV(normalize=True, scoring='mean_squared_error',
                    alphas=tuningAlpha, cv=10)
    ridge.fit(X, Y)
    prediction = ridge.predict(X)

    print("RIDGE REGRESSION")
    print("Best Alpha value for Ridge Regression : " + str(ridge.alpha_))
    print('Best RMSE for corresponding Alpha =',
          np.sqrt(mean_squared_error(Y, prediction)))
def run():
    # Data preprocessing
    train = DataPrep.prep_data(headless_run)
    # Scale data: https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use
    target = train.SalePrice
    train = train.drop(columns='SalePrice')
    X_train, X_test, y_train, y_test = train_test_split(
        train, target, test_size=0.25, random_state=0)

    # Trying L2 regularization
    clf = RidgeCV(cv=5).fit(X_train, y_train)
    # print(rmse_cv(clf).mean())
    # Ridge keeps all features; the coefficients show their relative weight
    coef = pd.Series(clf.coef_, index=X_train.columns)

    # Metrics: note the metric(y_true, y_pred) argument order, which the
    # original reversed (harmless for MSE/MAE, wrong for r2_score)
    variance_score = clf.score(X_test, y_test)
    MSEscore = mean_squared_error(y_test, clf.predict(X_test))
    MAEscore = median_absolute_error(y_test, clf.predict(X_test))
    R2score = r2_score(y_test, clf.predict(X_test))

    if not headless_run:
        print('Variance score: {}'.format(variance_score))
        # print("CLF best: {}".format(clf.best_score_))  # grid search only
        print('MSE score: {}'.format(MSEscore))
        print('MAE score: {}'.format(MAEscore))
        print('R2 score: {}'.format(R2score))
        # Plotting Residuals
        plt.scatter(clf.predict(X_train), clf.predict(X_train) - y_train,
                    color="green", s=10, label='Train data')
        plt.scatter(clf.predict(X_test), clf.predict(X_test) - y_test,
                    color="blue", s=10, label='Test data')
        plt.hlines(y=0, xmin=10, xmax=14, linewidth=2)
        plt.legend(loc='upper right')
        plt.title("Residual errors")
        plt.show()
    else:
        return [variance_score, MSEscore, MAEscore, R2score]
def compute_estimates(zhat, w, y, regularize=True, nuisance=False):
    """Compute tau_dr, tau_ols, tau_ols_ps, tau_resid on given confounders
    matrix and w and y."""
    tau_hat = dict()
    ps_hat, y0_hat, y1_hat = get_ps_y01_hat(zhat, w, y, regularize)
    tau_hat['tau_ols'] = tau_ols(zhat, w, y, regularize)
    tau_hat['tau_ols_ps'] = tau_ols_ps(zhat, w, y, regularize)
    tau_hat['tau_dr'] = tau_dr(y, w, y0_hat, y1_hat, ps_hat,
                               regularize=regularize)
    if regularize:
        lr = RidgeCV(alphas=(0.1, 1.0, 10.0))
    else:
        lr = LinearRegression()
    lr.fit(zhat, y)
    y_hat = lr.predict(zhat)
    tau_hat['tau_resid'] = tau_residuals(y, w, y_hat, ps_hat,
                                         regularize=regularize)
    if nuisance:
        return tau_hat, {'ps_hat': ps_hat, 'y0_hat': y0_hat, 'y1_hat': y1_hat}
    return tau_hat
def ridge_boston():
    boston = load_boston()
    x = boston.data
    y = boston.target
    train_x, test_x, train_y, test_y = \
        train_test_split(x, y, test_size=.25)
    std_s = StandardScaler()
    train_x = std_s.fit_transform(train_x)
    test_x = std_s.transform(test_x)  # transform only: reuse training statistics
    # ridge = Ridge(alpha=1.5)
    ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=4)
    ridge.fit(train_x, train_y)
    score = ridge.score(test_x, test_y)
    predict_y = ridge.predict(test_x)
    print(score)
    print(predict_y[:20])
    print(test_y[:20])
    return None
def model_main_linear():
    ts_code = '399300.SZ'
    x_train, x_test, y_train, y_test = getdata(ts_code, type='linear')
    # return
    # print(x_train.shape)
    # print(x_train)
    # print(x_test.shape)
    # print(y_train.shape)
    # print(y_train)
    # print(y_test.shape)

    # Linear regression
    model = LinearRegression()
    # # Linear support vector machine (LinearSVC)
    # model = LinearSVC(C=0.01)
    model.fit(x_train, y_train)
    y_predictions = model.predict(x_test)
    r2 = r2_score(y_test, y_predictions)
    print('intercept:', model.intercept_)
    print('coef:', model.coef_)
    # print('y_test:\n', y_test)
    # print('y_predictions:\n', y_predictions)
    print('r2:', r2)

    print('ridge regression:')
    model_ridge = RidgeCV()
    model_ridge.fit(x_train, y_train)
    y_predictions_ridge = model_ridge.predict(x_test)
    r2_ridge = r2_score(y_test, y_predictions_ridge)
    print('intercept:', model_ridge.intercept_)
    print('coef:', model_ridge.coef_)
    print('r2:', r2_ridge)

    print('lasso regression:')
    model_lasso = LassoCV()
    model_lasso.fit(x_train, y_train)
    y_predictions_lasso = model_lasso.predict(x_test)
    r2_lasso = r2_score(y_test, y_predictions_lasso)
    print('intercept:', model_lasso.intercept_)
    print('coef:', model_lasso.coef_)
    print('r2:', r2_lasso)
def train(self, site, exclude_dates):
    """
    Fit the regression model for a site over the specified window.
    exclude_dates is an optional set of datetime.date objects to exclude
    from training.
    cli: pymortar client
    """
    start_train = pd.to_datetime('2016-01-01').tz_localize(
        'US/Pacific').isoformat()
    end_train = pd.to_datetime(
        datetime.datetime.today().date()).tz_localize(
            'US/Pacific').isoformat()
    alphas = [0.0001, .001, 0.01, 0.05, 0.1, 0.5, 1, 10]

    # Get data from pymortar
    data = get_df(site, start_train, end_train)

    # Get weekdays
    data['date'] = data.index.date
    weekdays = get_workdays(start_train, end_train)
    day_filter = [d in weekdays for d in data['date']]
    df = data[day_filter]

    # Exclude dates
    day_filter = [d not in exclude_dates for d in df.index.date]
    df = df[day_filter]

    # Create ridge features
    df = create_ridge_features(df)

    # Remove NA rows
    df = df.dropna()
    df = df[df['power'] != 0]

    # Train model
    X_train, y_train = df.drop(['power', 'weather', 'date'], axis=1), df['power']
    model = RidgeCV(normalize=True, alphas=alphas)
    model.fit(pd.DataFrame(X_train), y_train)

    # Train error
    y_pred = model.predict(pd.DataFrame(X_train))

    self.model = model
def ridge(X_train, X_test, y_train, y_test, y_scaler, train_num):
    """
    Ridge regression
    :param X_train:
    :param X_test:
    :param y_train:
    :param y_test:
    :param y_scaler:
    :return:
    """
    # rig = RidgeCV(alphas=[1, 0.5, 0.1, 0.01, 0.05, 0.001, 0.005])
    rig = RidgeCV(alphas=[5.0, 10.0])
    rig.fit(X_train, y_train)
    y_pred = rig.predict(X_test)
    y_test = data.inverse_to_original_data(y_train.reshape(1, -1),
                                           y_test.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    y_pred = data.inverse_to_original_data(y_train.reshape(1, -1),
                                           y_pred.reshape(1, -1),
                                           scaler=y_scaler,
                                           train_num=train_num)
    evaluate.all_metrics(y_test, y_pred)
    evaluate.draw_fitting_curve(y_test, y_pred, 0)
def ridgecv(x, y):
    """
    Function for Ridge with Cross Validation
    :param x: Attributes
    :param y: Target
    :return: MSE
    """
    reg = RidgeCV(cv=10).fit(x, y)
    predictions = reg.predict(x)
    mse = metrics.mean_squared_error(y, predictions)
    plt.figure(figsize=(15, 10))
    # reg.coef_[0] assumes a 2-D (single-column) target; for a 1-D y use reg.coef_
    ft_importances_lm = pd.Series(reg.coef_[0], index=x.columns).sort_values()
    absolute_coefs = pd.Series(reg.coef_[0], index=x.columns)
    print(absolute_coefs.sort_values(ascending=False))
    ft_importances_lm.plot(kind='barh')
    plt.title("Ridge Coefficients \n Mean Squared Error = %f" % mse, fontsize=18)
    plt.xlim(-.6, .6)
    plt.show()
    print(reg.alpha_)
    return mse
def linear_model3():
    """
    Linear regression: ridge regression
    """
    boston = load_boston()
    x_train, x_test, y_train, y_test = train_test_split(
        boston.data, boston.target, test_size=0.2)
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.transform(x_test)  # transform only: reuse training statistics
    # estimator = Ridge(alpha=1)
    estimator = RidgeCV(alphas=(0.01, 0.1, 1, 10, 100))
    estimator.fit(x_train, y_train)
    y_predict = estimator.predict(x_test)
    print("Predicted values:\n", y_predict)
    print("Model coefficients:\n", estimator.coef_)
    print("Model intercept:\n", estimator.intercept_)
    error = mean_squared_error(y_test, y_predict)
    print("Mean squared error:\n", error)
    return None
class RidgeWithPost(BaseEstimator, TransformerMixin):

    def __init__(self, weight=1.0):
        # `weight` is passed positionally as RidgeCV's alphas
        self.ridge = RidgeCV(weight)

    def fit(self, X, y, sample_weight=None):
        self.ridge.fit(X, y)
        return self

    def predict(self, X):
        y = self.ridge.predict(X)
        # Post-process: floor every prediction at 18
        ranged = np.empty(len(y))
        for i in range(0, len(y)):
            if y[i] < 18:
                ranged[i] = 18
            else:
                ranged[i] = y[i]
        return ranged

    def score(self, X, y, sample_weight=None):
        return self.ridge.score(X, y)
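The clipping loop in predict() above could be collapsed to a single vectorised call; an equivalent method body, assuming the same lower bound of 18:

def predict(self, X):
    # element-wise floor at 18, same effect as the explicit loop
    return np.maximum(self.ridge.predict(X), 18)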
class FlowModel(object):
    """Model selection & Xarray compatibility"""

    def __init__(self, kind, model_config):
        self.kind = kind
        if kind == 'neural_net':
            self.m = FlowModel_DNN(**model_config)
        elif kind == 'xgboost':
            self.m = XGBRegressor(**model_config)
        elif kind == 'Ridge':
            self.m = RidgeCV(**model_config)
        else:
            raise NotImplementedError(str(kind) + ' not defined')

    def fit(self, Xda, yda, **kwargs):
        return self.m.fit(Xda, yda, **kwargs)

    def predict(self, Xda, name=None):
        # use with xarray, return xarray
        a = self.m.predict(Xda.values).squeeze()
        return add_time(a, Xda.time, name=name)
class UnawarePolicy:

    def __init__(self):
        self.F_reg = None

    def train(self, G, L, F):
        self.F_reg = RidgeCV().fit(np.hstack([G, L]), F)

    def evaluate(self, G, L, nb_seats=None):
        assert G.shape == L.shape
        nb_obs = G.shape[0]
        if nb_seats is None:
            nb_seats = nb_obs
        else:
            assert isinstance(nb_seats, int) and (nb_seats > 0)
            nb_seats = min(nb_obs, nb_seats)
        F_hat = self.F_reg.predict(np.hstack([G, L]))
        ind = F_hat.argsort(axis=0)[-nb_seats:][::-1]
        P = np.zeros([nb_obs, 1]).astype(bool)
        P[ind] = True
        return P
def predict(self, X):
    '''
    Override
    '''
    results = []
    X_ = np.array(X)
    # prediction for each observation
    for i in range(0, X_.shape[0]):
        X_actual = X_[i, :].reshape(1, -1)
        # X_i - X_actual
        X_disc = self._X_train - X_actual
        # ridge
        ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(
            X_disc, self._Y_train)
        # ridge predictions
        results.append(ridge.predict(X_actual)[0])
    return np.array(results).reshape(-1)
class EegLinearRegression:

    # Initialise the ridge regression model.
    def __init__(self, folds=5, regularisation_bound=(0.1, 200)):
        # The regularisation parameters to select from. Note: the original
        # used np.logspace, which treats the bounds as exponents of 10 and
        # would yield alphas up to 10**200; linspace keeps them as raw alpha
        # values, which is what the default bound implies.
        alphas = np.linspace(regularisation_bound[0],
                             regularisation_bound[1], num=500)
        # Uses K-fold CV here. Does CV over alphas to get the best one.
        self.model = RidgeCV(cv=folds, alphas=alphas,
                             scoring="neg_mean_squared_error")

    # Fit the LR model on the input data.
    def __train(self, x, y):
        trained_model = self.model.fit(x, y)
        # pickle.dump(trained_model, open('ridge_regression.sav', 'wb'))
        return trained_model

    # Test the LR model to get the predicted score for the given input data.
    def __test(self, x, y):
        predicted = self.model.predict(x)
        # return mean_squared_error(y, predicted)
        # y_true comes first (the original passed the arguments reversed)
        return explained_variance_score(y, predicted)

    # Calculate the nested cv score.
    def __nested_cv_score(self, x, y, outer_folds=5):
        x = np.array(x)
        outer_cv = KFold(n_splits=outer_folds, shuffle=True)
        scores = []
        for train_data_ind, test_data_ind in outer_cv.split(x):
            x_train, x_test = x[train_data_ind], x[test_data_ind]
            y_train, y_test = y[train_data_ind], y[test_data_ind]
            # Trains RidgeCV with cross validation.
            self.__train(x_train, y_train)
            best_score_for_fold = self.__test(x_test, y_test)
            scores += [best_score_for_fold]
        return np.average(np.array(scores))

    # Evaluate the LR model by giving an accuracy value.
    # Will accept type of evaluation as well.
    def evaluate(self, x, y):
        return self.__nested_cv_score(x, y)
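For comparison, the nested CV implemented by hand above can be expressed with cross_val_score, which refits the inner RidgeCV on each outer training fold. A sketch with illustrative data and an alpha grid matching the class default:

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import KFold, cross_val_score

X, y = make_regression(n_samples=120, n_features=6, noise=5.0, random_state=0)
inner = RidgeCV(alphas=np.linspace(0.1, 200, 500), cv=5,
                scoring="neg_mean_squared_error")
outer_scores = cross_val_score(inner, X, y,
                               cv=KFold(n_splits=5, shuffle=True, random_state=0),
                               scoring="explained_variance")
print(outer_scores.mean())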
def ridge_cv(self, nsplits: int, lam: float = None):
    """
    Runs a cross validation on the data set and returns the cross
    validation performance.
    :param nsplits: number of cv splits
    :param lam: tuning parameter
    :return: the cross-validated mse
    """
    if lam is None:
        model = RidgeCV(cv=nsplits).fit(self.x, self.y)
        lam = model.alpha_
    cv = KFold(n_splits=nsplits)
    mse_result = []
    for train, test in cv.split(self.x):
        x_train = self.x[train, :]
        x_test = self.x[test, :]
        y_train = self.y[train]
        y_test = self.y[test]
        model = Ridge(alpha=lam).fit(x_train, y_train)
        y_predict = model.predict(x_test)
        mse_result.append(mse(y_test, y_predict))
    return np.mean(mse_result)
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Performing a ridge regression with built-in CV and plotting the
    feature importance
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    # Note: unlike lasso, ridge rarely drives coefficients exactly to zero
    print("Ridge picked " + str(sum(coef != 0)) +
          " variables and eliminated the other " +
          str(sum(coef == 0)) + " variables")

    # Extract the feature importance
    imp_coef = coef.sort_values()

    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()

        # Visualizing the regression
        visualizer = ResidualsPlot(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)   # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)   # Evaluate the model on the test data
        visualizer.show()                  # Finalize and render the figure

    # Using the test data to calculate a score
    y_pred = reg.predict(X_test)

    # Return metrics
    return {
        "name": "Ridge Regression",
        "R squared": reg.score(X_test, y_test),
        "R squared training": reg.score(X_train, y_train),
        "RMSE": rmse(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
def Regression_scikit(N, deg, sigma, lamb, method='ls', stats=True):
    # poly = PolynomialFeatures(degree=deg)
    # XY = poly.fit_transform(xy)
    if method == 'ls':
        linreg = LinearRegression(fit_intercept=False)
        linreg.fit(XY, z)
        print("beta 0: ", linreg.intercept_)
        print("betas : ", linreg.coef_)
        zpredict = linreg.predict(XY)
    elif method == 'ridge':
        ridge = RidgeCV([float(lamb)])
        ridge.fit(XY, z)
        zpredict = ridge.predict(XY)
        print("beta 0: ", ridge.intercept_)
        print("betas : ", ridge.coef_)
    elif method == 'lasso':
        lasso = Lasso(alpha=float(lamb))  # Lasso takes a scalar alpha, not a list
        lasso.fit(XY, z)
        zpredict = lasso.predict(XY)  # was ypredict, which broke statistics() below
        print("beta 0: ", lasso.intercept_)
        print("betas : ", lasso.coef_)
    else:
        print("Error: 'method' must be either 'ls', 'ridge', or 'lasso'."
              "\nExiting...")
        sys.exit(0)
    if stats == True:
        statistics(XY, z, zpredict, deg, lamb, method)
def regularization_m(X_re, y_re, predFeat=False):
    n_alphas = 200
    alphas = np.logspace(1, 8, n_alphas)
    coefs = []
    n = 0
    for a in alphas:
        n += 1
        ridge = Ridge(alpha=a, fit_intercept=False)
        ridge.fit(X_re, y_re)
        coefs.append(ridge.coef_)
        # print(n, coefs)
    ax = plt.gca()
    ax.plot(alphas, coefs)
    ax.set_xscale('log')
    ax.set_xlim(ax.get_xlim()[::-1])  # reverse axis
    plt.xlabel('alpha')
    plt.ylabel('weights')
    plt.title('Ridge coefficients as a function of the regularization')
    plt.axis('tight')
    plt.show()

    ridge = Ridge(alpha=28.6)  # Ridge with a pre-chosen alpha
    ridge.fit(X_re, y_re)
    print(ridge.coef_, ridge.intercept_, ridge.alpha)

    ridgecv = RidgeCV(alphas=alphas)  # pass several alphas; the model picks the best
    ridgecv.fit(X_re, y_re)
    print(ridgecv.coef_, ridgecv.intercept_, ridgecv.alpha_)

    lasso = Lasso(alpha=0.01)
    lasso.fit(X_re, y_re)
    print(lasso.coef_, lasso.intercept_, lasso.alpha)

    elasticnet = ElasticNet(alpha=1.0, l1_ratio=0.5)
    elasticnet.fit(X_re, y_re)
    print(elasticnet.coef_, elasticnet.intercept_, elasticnet.alpha)

    if type(predFeat).__module__ == 'numpy':
        return ridgecv.predict(predFeat)
def train_ridge(x_train, x_valid, y_train, y_valid, classifier):
    # print('linear_model')
    preds = []
    if classifier == 'RidgeCV':
        # Ridge regression with built-in cross-validation.
        clf = RidgeCV(alphas=[1, 0.1, 0.01, 0.001])
    if classifier == 'LassoCV':
        clf = LassoCV(alphas=[1, 0.1, 0.01, 0.001])
    if classifier == 'LR':
        clf = LinearRegression()
    if classifier == 'BAY':
        clf = BayesianRidge()
    if classifier == 'ElaNet':
        clf = ElasticNetCV(cv=5, random_state=0)
    if classifier == 'SVM':
        # Linear Support Vector Regression, no better than chance
        clf = SVC(gamma='scale', tol=1e-5)
    if classifier == 'LinearSVC':
        clf = LinearSVC(tol=1e-10)
    if classifier == 'SGD':  # no better than chance
        clf = SGDClassifier(loss='log', max_iter=1000000, tol=1e-3)
    if classifier == 'RF':  # no better than chance
        clf = RandomForestClassifier(n_estimators=1000, max_depth=5,
                                     random_state=0)
    if classifier == 'LogRegCV':  # no better than chance
        # was a second 'LR' branch in the original, which shadowed
        # LinearRegression; renamed so both remain reachable
        clf = LogisticRegressionCV(cv=5, random_state=0,
                                   multi_class='multinomial')
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_valid)
    preds.append(y_pred.reshape(-1, 1))
    # preds = np.hstack(preds)
    # print(roc_auc_score(y_valid, preds.mean(1)))
    # print('roc_auc_score:', roc_auc_score(y_valid, y_pred))
    return clf
class myStackingFeaturesRegressor(BaseEstimator, TransformerMixin):

    def __init__(self):
        self.estimator = None
        # note: loss='ls' was renamed to 'squared_error' in scikit-learn 1.2
        self.lgb = GradientBoostingRegressor(loss='ls', alpha=0.9,
                                             n_estimators=100,
                                             learning_rate=0.01, max_depth=8,
                                             subsample=0.8,
                                             min_samples_split=9,
                                             max_leaf_nodes=10)
        self.grd_enc = OneHotEncoder()
        self.lr = RidgeCV()
        self.classes_ = [-1, 1]

    def fit(self, X, y=None, **fit_params):
        self.lgb.fit(X, y)
        # one-hot encode the leaf indices of each boosting stage
        self.grd_enc.fit(self.lgb.apply(X))
        self.lr.fit(self.grd_enc.transform(self.lgb.apply(X)), y)
        return self  # the original returned None, breaking sklearn chaining

    def predict(self, X):
        return self.lr.predict(self.grd_enc.transform(self.lgb.apply(X)))
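A minimal usage sketch for the class above; the synthetic data are illustrative, and it assumes a scikit-learn version where loss='ls' is still accepted (or the rename noted in the comment has been applied):

from sklearn.datasets import make_regression

X, y = make_regression(n_samples=300, n_features=8, random_state=0)
model = myStackingFeaturesRegressor()
model.fit(X, y)          # boost, encode leaves, then fit the ridge on top
print(model.predict(X[:5]))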
def iris_model():
    # import data
    df = pd.read_csv(r"/home/airflow/gcs/dags/data/Iris.csv")

    # dummify
    df = pd.get_dummies(df, drop_first=True)

    # train test split
    X_train, X_test, y_train, y_test = train_test_split(
        df[df.columns[df.columns != "PetalWidthCm"]],
        df["PetalWidthCm"], test_size=0.2)

    # cross validation
    model = RidgeCV(alphas=np.logspace(-6, 6, 13), cv=3)
    model.fit(X_train, y_train)

    # prediction
    y_pred = model.predict(X_test)

    # appending results back
    X_test["Predicted"] = y_pred.round(1)
    X_test["Actual"] = y_test

    # rmse addition
    # rmse = sqrt(mean_squared_error(y_test, y_pred))
    # X_test["rmse"] = rmse

    # save output
    print('*_*_*_*_*_*_*__*_*_*')
    with open('/home/airflow/gcs/dags/data/iris_output_' +
              str(datetime.now().strftime("%d%m%Y%H%M%S")) + '.csv',
              'w', newline='') as fp:
        writer = csv.DictWriter(fp, fieldnames=X_test.columns)
        writer.writeheader()
        for row in X_test.to_dict('records'):
            writer.writerow(row)
    print(X_test)
class EncodingModel:

    def __init__(self, method='ridgecv', cv=None, alphas=None):
        self.method = method
        self.cv = cv  # default uses efficient gcv
        self.clf = None
        if alphas is not None:
            self.alphas = alphas
        else:
            self.alphas = [0.001, 0.01, 0.1]
            self.alphas.extend(np.linspace(1, 10, 10))

    def fit(self, X, y):
        if self.method == 'lr':
            self.clf = LinearRegression()
        elif self.method == 'ridgecv':
            self.clf = RidgeCV(cv=self.cv, alphas=self.alphas)
        else:
            raise Exception(f'method {self.method} not implemented')
        self.clf.fit(X, y)

    def predict(self, X):
        return self.clf.predict(X)
def impute_age(traintest):
    agetrain = traintest[pd.notnull(traintest['Age'])]
    agetest = traintest[pd.isnull(traintest['Age'])]
    columns = agetrain.columns.difference(['Age'])
    et = ExtraTreesRegressor(n_estimators=50)
    et.fit(agetrain[columns], agetrain.Age)
    modelselector = SelectFromModel(et, prefit=True, threshold=0.01)
    Xtrainage = modelselector.transform(agetrain[columns])
    Xtestage = modelselector.transform(agetest[columns])
    knn = KNeighborsRegressor()
    ridge = RidgeCV(cv=5)
    forest = ExtraTreesRegressor(n_estimators=50)
    ridge.fit(Xtrainage, agetrain.Age)
    forest.fit(Xtrainage, agetrain.Age)
    knn.fit(Xtrainage, agetrain.Age)
    missingAge1 = ridge.predict(Xtestage)
    missingAge2 = forest.predict(Xtestage)
    missingAge3 = knn.predict(Xtestage)
    # average the three imputations
    missingAge = (missingAge1 + missingAge2 + missingAge3) / 3
    return missingAge
def fitLakeLevels( self, flowData, lakeData, **kwargs ):
    # model lake levels from stream flows
    xTrain = self.setDelay( flowData, kwargs[ 'nDays' ] )

    flowScaler = preprocessing.StandardScaler().fit( xTrain )
    xTrain = flowScaler.transform( xTrain )
    self.flowScaler = flowScaler

    # fit to daily changes in elevation
    yTrain = lakeData - np.roll( lakeData, 1 )
    yTrain[ 0 ] = 0.

    if kwargs[ 'simpleModel' ]:
        model = RidgeCV( alphas = np.logspace( -2., 2. ) )
    else:
        model = ExtraTreesRegressor( n_estimators = 50, n_jobs = 4,
                                     random_state = 42 )

    model.fit( xTrain, yTrain )
    self.lakeModel = model

    ypreds = model.predict( xTrain )
    lakePreds = lakeData[ 0 ] + np.cumsum( ypreds )

    plt.clf()
    plt.plot( self.dates, yTrain + lakeData, label = 'Actual' )
    plt.plot( self.dates, lakePreds, label = 'Predicted' )

    plt.xlabel( 'Date' )
    plt.ylabel( 'Lake Travis Elevation (ft)' )
    plt.legend()
    plt.savefig( 'lakelevels.png' )
def scale_test_and_train_ridge(X, y):
    """
    Run a ridge regression on the model
    """
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2,
                                            random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25,
                                                      random_state=3)
    X_train_scale = X_train.values
    X_val_scale = X_val.values
    X_test_scale = X_test.values
    scale = StandardScaler()
    X_train_scale = scale.fit_transform(X_train_scale)
    X_test_scale = scale.transform(X_test_scale)
    X_val_scale = scale.transform(X_val_scale)
    ridge = RidgeCV(cv=5)
    ridge.fit(X_train_scale, y_train)
    ridge.score(X_train_scale, y_train)
    y_pred = ridge.predict(X_val_scale)
    print(f'Ridge Regression val R^2: {ridge.score(X_val_scale, y_val):.3f}')
    print(f'Ridge Regression val RMSE: '
          f'{sqrt(mean_squared_error(y_val, y_pred)):.3f}')
    return ridge.coef_
def predict(self, X):
    '''
    Override
    '''
    results = []
    X_ = np.array(X)
    # prediction for each observation
    for i in range(0, X_.shape[0]):
        X_actual = X_[i, :].reshape(1, -1)
        # we can calculate the coefficients for one row at a time
        actual_leaf_ids = self._extract_leaf_nodes_ids(X_actual)
        # calculate coefficient weights alpha_i(X_actual)
        alphas = self._get_forest_coefficients(actual_leaf_ids)
        # X_i - X_actual
        X_disc = self._X_train - X_actual
        # ridge, weighting each training point by alpha_i(X_actual);
        # the third positional argument of fit() is sample_weight, so
        # pass it by keyword to make that explicit
        ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(
            X_disc, self._Y_train, sample_weight=alphas)
        # ridge predictions
        results.append(ridge.predict(X_actual)[0])
    return np.array(results).reshape(-1)
def ridgecv():
    data = init_data()
    # X is a 2-D array with one row per sample (what ridge training expects);
    # Y is a 1-D array
    X = data[:, 1:]
    Y = data[:, 0]
    # use train_test_split to randomly split X, Y into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,
                                                        random_state=42)
    alphas = [0.03, 0.05, 0.1, 0.2, 0.4, 0.5, 0.6, 1, 1.5, 2]
    ridgecv = RidgeCV(alphas, store_cv_values=True)
    ridgecv.fit(X_train, y_train)

    # Pick alpha manually: cv_values_ is only available when RidgeCV was
    # built with store_cv_values=True. Averaging over axis 0 gives the mean
    # error over all samples for each alpha (one entry per candidate alpha).
    # smallest_idx = ridgecv.cv_values_.mean(axis=0).argmin()
    # print("manual choice:", alphas[smallest_idx])
    smallest_idx = ridgecv.cv_values_.mean(axis=0).argmin()

    f, ax = plt.subplots(figsize=(7, 5))
    ax.set_title(r"various values of a")
    xy = (alphas[smallest_idx], ridgecv.cv_values_.mean(axis=0)[smallest_idx])
    xytext = (xy[0] + .01, xy[1] + .1)
    # https://blog.csdn.net/qq_30638831/article/details/79938967
    ax.annotate(r'choose this a', xy=xy, xytext=xytext,
                arrowprops=dict(facecolor='black', shrink=0, width=0))
    ax.plot(alphas, ridgecv.cv_values_.mean(axis=0))
    plt.show()

    print("best alpha chosen by sklearn:", ridgecv.alpha_)
    print(ridgecv.coef_)
    print(ridgecv.intercept_)
    # evaluate on the held-out split (the original predicted on the full X
    # while labelling the result as test-set MSE)
    test_Y_pred = ridgecv.predict(X_test)
    print("test-set MSE:", mean_squared_error(y_test, test_Y_pred))
# else:
#     model = GPy.models.GPRegression(X_train, Y_train, kernel=k)
icmk = GPy.util.multioutput.ICM(input_dim=X.shape[1], num_outputs=6,
                                kernel=k, W_rank=args.rank)
model = GPy.models.GPCoregionalizedRegression(X_train_list, Y_train_list,
                                              kernel=icmk)
model.optimize(messages=True, max_iters=100)
print(model)

# Get predictions
info_dict = {}
preds_list = []
vars_list = []
if args.model == 'ridge' or args.model == 'svr':
    preds = model.predict(X_test)
    if args.label_preproc == 'scale':
        preds = Y_scaler.inverse_transform(preds)
    elif args.label_preproc == 'warp':
        preds += 50
    info_dict['mae'] = MAE(preds, Y_test.flatten())
    info_dict['rmse'] = np.sqrt(MSE(preds, Y_test.flatten()))
    info_dict['pearsonr'] = pearsonr(preds, Y_test.flatten())
else:
    # TODO: check if this makes sense
    # preds, vars = model.predict(X_test)
    # X_test_pred, Y_test_pred, index = GPy.util.multioutput.build_XY(
    #     X_test_list, Y_test_list)
    # noise_dict = {'output_index': X_test_pred[:, -1:].astype(int)}
    # preds, vars = model.predict_noiseless(X_test, Y_metadata=noise_dict)
    for emo_id, emo in enumerate(EMOS):
        # TODO: preprocessing
trainingFeatures['network'] = predictions
predictions = net.activateOnDataset(testDs)
testingFeatures['network'] = predictions

#%%
trainingFeaturesPca, testingFeaturesPca = getPca(trainingFeatures,
                                                 trainingTarget,
                                                 testingFeatures, 3)
for col in trainingFeaturesPca.columns:
    trainingFeatures[col] = trainingFeaturesPca[col]
    testingFeatures[col] = testingFeaturesPca[col]

#%%
model = RidgeCV(alphas=[0.01, 1.0, 10.0])
model.fit(trainingFeatures, trainingTarget)
predictions = model.predict(trainingFeatures)
trainingFeatures['RidgeCV'] = predictions
predictions = model.predict(testingFeatures)
testingFeatures['RidgeCV'] = predictions

#%%
model = SGDRegressor()
model.fit(trainingFeatures, trainingTarget)
predictions = model.predict(trainingFeatures)
trainingFeatures['SGDRegressor'] = predictions
predictions = model.predict(testingFeatures)
testingFeatures['SGDRegressor'] = predictions

#%%
                                  scoring='mean_absolute_error', cv=10)
ozone_ridgecv_reg = ozone_ridgecv_reg.fit(ozone_train.drop('ozone', axis=1),
                                          ozone_train['ozone'])

## Compare regularization models
print("Linear Coef: " + str(ozone_ln_reg.coef_) +
      "\nRidge Coef: " + str(ozone_ridge_reg.coef_) +
      "\nLasso Coef: " + str(ozone_lasso_reg.coef_) +
      "\nCV Coef: " + str(ozone_ridgecv_reg.coef_) +
      "\nCV alpha: " + str(ozone_ridgecv_reg.alpha_))

# Predict using models and evaluate
ozone_ln_pred = ozone_ln_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridge_pred = ozone_ridge_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_lasso_pred = ozone_lasso_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridgecv_pred = ozone_ridgecv_reg.predict(ozone_test.drop('ozone', axis=1))

## Calculate MAE, RMSE, and R-squared for all models
ozone_ln_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ln_pred)
ozone_ln_rmse = sqrt(metrics.mean_squared_error(ozone_test['ozone'], ozone_ln_pred))
ozone_ln_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ln_pred)

ozone_ridge_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ridge_pred)
ozone_ridge_rmse = sqrt(metrics.mean_squared_error(ozone_test['ozone'], ozone_ridge_pred))
ozone_ridge_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ridge_pred)

ozone_lasso_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_lasso_pred)
ozone_lasso_rmse = sqrt(metrics.mean_squared_error(ozone_test['ozone'], ozone_lasso_pred))
ozone_lasso_r2 = metrics.r2_score(ozone_test['ozone'], ozone_lasso_pred)

ozone_ridgecv_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ridgecv_pred)
# temp_train is the intermediate training data, i.e., outputs of the 3
# level-0 learners, which are also the inputs of the level-1 learner
temp_train = np.zeros((len(Y2), len(clfs)))
temp_test = np.zeros((Xtest.shape[0], len(clfs)))
for i, clf in enumerate(clfs):
    clf.fit(X1, Y1)                       # train each level-0 learner
    temp_train[:, i] = clf.predict(X2)    # intermediate data for the level-1 learner given X2
    temp_test[:, i] = clf.predict(Xtest)  # likewise for Xtest

# ====================== Training the level-1 learner ===================
# level-1 learner
# cv = 5: 5-fold cross validation
alphas = [0.0001, 0.005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0,
          10.0, 50.0, 100.0, 500.0, 1000.0]
stk = RidgeCV(alphas=alphas, normalize=True, cv=5).fit(temp_train, Y2)

# ====================== Prediction ===================
# predict the test data and write output Y_hat to .csv file
Y_hat = stk.predict(temp_test)
fh = open('n_50_predictions.csv', 'w')  # open file for upload
fh.write('ID,Prediction\n')             # output header line
for i, yi in enumerate(Y_hat):
    fh.write('{},{}\n'.format(i + 1, yi))  # output each prediction
fh.close()
print('Writing finished!')
    model_elastic, train_x, y, scoring='neg_mean_squared_error',
    cv=kfold).mean()) + "\nTime of " + str(t_elastic))
pred_elastic = pd.DataFrame(
    data=np.expm1(model_elastic.predict(test_x)),  # values
    index=range(TRAIN_ROWS, TRAIN_COLS),           # set index
    columns=['SalePrice'])                         # 1st column as index

# Use Ridge to Cross Validate and Model
t3 = time.time()
model_ridge = RidgeCV(alphas=alphas).fit(train_x, y)
t_ridge = time.time() - t3  # the original subtracted t1, overstating the ridge time
print("\nRidge Score with CV: " + str(-1 * cross_val_score(
    model_ridge, train_x, y, scoring='neg_mean_squared_error',
    cv=kfold).mean()) + "\nTime of " + str(t_ridge))
pred_ridge = pd.DataFrame(
    data=np.expm1(model_ridge.predict(test_x)),  # values
    index=range(TRAIN_ROWS, TRAIN_COLS),         # set index
    columns=['SalePrice'])                       # 1st column as index

# Use Random Forest to make estimator
from sklearn.ensemble import RandomForestRegressor
t4 = time.time()
clf = RandomForestRegressor().fit(train_x, np.asarray(y, dtype="|S6"))
t_rf = time.time() - t4
print("\nRandom Forest Score with CV: " +
      str(np.mean(cross_val_score(clf, train_x, y, cv=10))) +
      "\nTime of " + str(t_rf))
pred_rf = pd.DataFrame(
    data=np.expm1(clf.predict(test_x)),   # values
    index=range(TRAIN_ROWS, TRAIN_COLS),  # set index
    columns=['SalePrice'])                # 1st column as index
ridge_model = Ridge(
    solver='auto',
    fit_intercept=True,
    alpha=1.0,
    max_iter=100,
    normalize=False,
    tol=0.05,
    random_state=1,
)
ridge_modelCV = RidgeCV(
    fit_intercept=True,
    alphas=[5.0],
    normalize=False,
    cv=2,
    scoring='neg_mean_squared_error',
)
ridge_model.fit(X_train, Y_train)
ridge_modelCV.fit(X_train, Y_train)

Y_dev_preds_ridge = ridge_model.predict(X_dev)
Y_dev_preds_ridge = Y_dev_preds_ridge.reshape(-1, 1)
print("RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridge))

Y_dev_preds_ridgeCV = ridge_modelCV.predict(X_dev)
Y_dev_preds_ridgeCV = Y_dev_preds_ridgeCV.reshape(-1, 1)
print("CV RMSL error on dev set:", rmsle(Y_dev, Y_dev_preds_ridgeCV))

ridge_preds = ridge_model.predict(X_test)
ridge_preds = np.expm1(ridge_preds)
ridgeCV_preds = ridge_modelCV.predict(X_test)
ridgeCV_preds = np.expm1(ridgeCV_preds)


def aggregate_predicts3(Y1, Y2, Y3, ratio1, ratio2):
    assert Y1.shape == Y2.shape
    return Y1 * ratio1 + Y2 * ratio2 + Y3 * (1.0 - ratio1 - ratio2)

# Y_dev_preds = aggregate_predicts3(Y_dev_preds_rnn, Y_dev_preds_ridgeCV,
#                                   Y_dev_preds_ridge, 0.4, 0.3)
# print("RMSL error for RNN + Ridge + RidgeCV on dev set:",
#       rmsle(Y_dev, Y_dev_preds))
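A usage sketch for aggregate_predicts3 defined above; the arrays and blend ratios are illustrative:

import numpy as np

Y1 = np.array([10.0, 20.0])
Y2 = np.array([12.0, 18.0])
Y3 = np.array([9.0, 21.0])
# 0.4 and 0.3 weight the first two models; the remaining 0.3 goes to Y3
blended = aggregate_predicts3(Y1, Y2, Y3, ratio1=0.4, ratio2=0.3)
print(blended)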
rows = []
ys = []
while rowindex < N:
    rowindex = rowindex + 1
    data = input().split()
    feature = [float(data[0]), float(data[1])]
    # print(np.vander(feature, 5).flatten())
    rows.append(np.vander(feature, 5).flatten())
    ys.append(float(data[-1]))
# print(rows)

ridge = RidgeCV(alphas=[0.1, 1.0, 10.0])
ridge.fit(rows, ys)
print(ridge.alpha_)
print(ridge.coef_)
print(ridge.intercept_)

predictNum = int(input())
rowindex = 0
rows = []
while rowindex < predictNum:
    rowindex = rowindex + 1
    data = input().split()
    feature = [float(data[0]), float(data[1])]
    rows.append(np.vander(feature, 5).flatten())

for value in ridge.predict(rows):
    print(value)
for l in range(len(bvar)):
    ind = int(uni(0, 1) * len(bvar))
    ar.append(bvar[ind][1])
    ar1.append(bvar[ind][2])
    y.append(bvar[ind][0])

# write as arrays, stack them
ar = np.array(ar)
ar1 = np.array(ar1)
y = np.array(y)
A = np.vstack([ar, ar1, np.ones(len(bvar))]).T

# cross-validate the ridge regression
cl = RidgeCV(alphas=[0.5, 1.0, 50.0, 500.0])
# cl = Ridge(alpha=1.0)
cl.fit(A, y)
# if cl.coef_[0] >= 0:
i += 1

# arrays for predicted values and for the a, b, c coefficients
# (predict expects a 2-D array: one row per sample)
val_arr.append(cl.predict([[32.21, 31.01, 1.]]))
coef_arr.append([cl.coef_[0], cl.coef_[1], cl.intercept_])

print('The mean and standard deviation for this object is')
print(np.std(val_arr), np.mean(val_arr))
coef_arr = np.array(coef_arr)
print("Coefficients of the ridge and their standard deviations")
print(np.mean(coef_arr[:, 0]), np.std(coef_arr[:, 0]),
      np.mean(coef_arr[:, 1]), np.std(coef_arr[:, 1]),
      np.mean(coef_arr[:, 2]), np.std(coef_arr[:, 2]))

# plot the coefficient arrays
plt.hist(coef_arr[:, 1], alpha=0.3)
plt.hist(coef_arr[:, 0], alpha=0.3)
plt.show()
time_taken = end_time - start_time
print("Time taken for pre-blending calculations: ", time_taken)
print("CV-Results", cv_results)

# Start blending!
print("Blending models.")
alphas = [0.0001, 0.005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0,
          10.0, 50.0, 100.0, 500.0, 1000.0]
bclf = RidgeCV(alphas=alphas, normalize=True, cv=5)
bclf.fit(blend_train, Y_dev)
print("Ridge Best alpha = ", bclf.alpha_)

# Predict now
Y_test_predict = bclf.predict(blend_test)

if DEVELOP:
    score1 = metrics.mean_absolute_error(Y_test, Y_test_predict)
    score = normalized_gini(Y_test, Y_test_predict)
    # score1 is a mean absolute error, so label it MAE (the original said MSE)
    print('Ridge MAE = %s normalized Gini = %s' % (score1, score))
else:
    # Submit! and generate solution
    score = cv_results.mean()
    print('Avg. CV-Score = %s' % (score))
    # generate solution
    submission = pd.DataFrame({"Id": testidx, "cost": Y_test_predict})
    submission = submission.set_index('Id')
    submission.to_csv("bench_gen_stacking.csv")
# - **alphas:** array of alpha values to try

# create an array of alpha values
alpha_range = 10.**np.arange(-2, 3)
alpha_range

# select the best alpha with RidgeCV
from sklearn.linear_model import RidgeCV
ridgeregcv = RidgeCV(alphas=alpha_range, normalize=True,
                     scoring='mean_squared_error')
ridgeregcv.fit(X_train, y_train)
ridgeregcv.alpha_

# predict method uses the best alpha value
y_pred = ridgeregcv.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# ### Lasso regression
#
# - [Lasso](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html) documentation
# - **alpha:** must be positive, increase for more regularization
# - **normalize:** scales the features (without using StandardScaler)

# try alpha=0.001 and examine coefficients
from sklearn.linear_model import Lasso
lassoreg = Lasso(alpha=0.001, normalize=True)
lassoreg.fit(X_train, y_train)
print(lassoreg.coef_)
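Both normalize=True and the positive 'mean_squared_error' scoring string used in the cell above were removed in later scikit-learn releases. A present-day equivalent (a sketch; standardizing with StandardScaler is assumed to be an acceptable substitute for the old normalize behaviour) wraps RidgeCV in a pipeline:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV

pipe = make_pipeline(StandardScaler(),
                     RidgeCV(alphas=alpha_range,
                             scoring='neg_mean_squared_error'))
pipe.fit(X_train, y_train)
print(pipe[-1].alpha_)  # best alpha found by the inner CV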
embs = util.load_embs(EMBS)

X = []
with open(INPUTS) as f:
    for line in f:
        X.append(util.preprocess_sent(line.split('_')[1]))
Y = np.loadtxt(LABELS)[:, 1]

###################
# PREPROCESS X
X = np.array([util.average_sent(sent, embs) for sent in X])
# print(X)
# print(X.shape)

####################
# RIDGE
m = RidgeCV()
# m = KernelRidge(kernel='rbf')
# m = Ridge()
m.fit(X[:SPLIT], Y[:SPLIT])
preds = m.predict(X[SPLIT:])
Y_test = Y[SPLIT:]
# zip returns an iterator in Python 3, so materialise it before slicing
for tup in list(zip(preds, Y_test))[:20]:
    print(tup)
print(MAE(preds, Y_test))
print(np.sqrt(MSE(preds, Y_test)))
print(pearsonr(preds, Y_test))
# Save regressors
pickle_file = 'regressor.pickle'
try:
    f = open(pickle_file, 'wb')
    save = {
        'random_forest_regressor': rfr,
        'ridge': ridge,
    }
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
    f.close()
except Exception as e:
    print('Unable to save data to', pickle_file, ':', e)
    raise

# Load regressor
pickle_file = 'regressor.pickle'
with open(pickle_file, 'rb') as f:
    save = pickle.load(f)
    rfr = save['random_forest_regressor']
    ridge = save['ridge']
    del save

# Predict test_data
y_pred_rfr = rfr.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
pd.DataFrame({'X1_Random_Forest': y_pred_rfr,
              'X1_Ridge': y_pred_ridge}).to_csv('Results from Hang Yao.csv')
train, test, train_ret, test_ret, train_stock, test_stock = \
    train_test_split(inst, ret, stock, test_size=0.4, random_state=1)

# SVR modeling
from sklearn.svm import SVR
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import RFE

rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
poly = SVR(kernel='poly', C=1e3, degree=2)
rig = RidgeCV()
rig.fit(train, train_ret)
rig.coef_
test_predict = rig.predict(test)

hits = ((test_ret > 0) & (test_predict > 0)) | ((test_ret < 0) & (test_predict < 0))
hit_ratio = 1.0 * sum(hits) / len(test_ret)

plt.figure(2)
plt.subplot(1, 2, 1)
plt.plot(test_ret, 'ko')
plt.plot(test_predict, 'ro')
plt.ylim([-1, 1])
plt.xlim([0, len(test_ret)])
plt.plot([0, 100], [0, 0], 'g--')
plt.xticks(range(1, len(test_ret)), test_stock, rotation='vertical')
plt.title('Actual and Predicted Returns')
plt.tick_params(axis='x', labelsize=5)
# cv(model, X[:m], Y[:m])
# cv(model, X, Y)

print(OKGREEN)
print("Done building models")
print(ENDC)

# -----------------------------------------------------------
# Predictions
print(OKBLUE)
print("Making predictions")
print(ENDC)

clfP = clf.predict(clfTestX)
linP = linreg.predict(newTestX)

print(OKGREEN)
print("Done making predictions")
print(ENDC)

# -----------------------------------------------------------
# Analyze residuals
print(OKGREEN)
print("Analyzing residuals: ")
print("The following variables shall be defined.")
print("Indices in the various arrays correspond to one another")
print("------------------------------------------------------------------------------")
print("msp_LinearRegression_i : indices of mispredictions > 5")
print("msp_LinearRegression_y : mispredicted labels")
print("msp_LinearRegression   : mispredictions")
f.tight_layout(rect=[0.05, 0.05, 0.95, 0.95])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

###############################################################################
# At first, a linear model will be applied on the original targets. Due to the
# non-linearity, the model trained will not be precise during the
# prediction. Subsequently, a logarithmic function is used to linearize the
# targets, allowing better prediction even with a similar linear model as
# reported by the median absolute error (MAE).

f, (ax0, ax1) = plt.subplots(1, 2, sharey=True)

regr = RidgeCV()
regr.fit(X_train, y_train)
y_pred = regr.predict(X_test)

ax0.scatter(y_test, y_pred)
ax0.plot([0, 2000], [0, 2000], '--k')
ax0.set_ylabel('Target predicted')
ax0.set_xlabel('True Target')
ax0.set_title('Ridge regression \n without target transformation')
ax0.text(100, 1750, r'$R^2$=%.2f, MAE=%.2f' % (
    r2_score(y_test, y_pred), median_absolute_error(y_test, y_pred)))
ax0.set_xlim([0, 2000])
ax0.set_ylim([0, 2000])

regr_trans = TransformedTargetRegressor(regressor=RidgeCV(),
                                        func=np.log1p,
                                        inverse_func=np.expm1)
regr_trans.fit(X_train, y_train)
X_pred = X[-predPeriod:]
X = X[:-predPeriod]  # re-sizing the features for training

dataset.dropna(inplace=True)  # get rid of NaN in the 'label' column
# create label
y = np.array(dataset['label'])

# note: the cross_validation module was renamed model_selection in later scikit-learn
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.2, random_state=1)

# use RidgeCV as the algorithm (the original comment said LinearRegression)
# clf = LinearRegression()
clf = RidgeCV(alphas=[0.1, 0.5, 1, 10])
clf.fit(X_train, y_train)
# start_time = time.time()
y_pred = clf.predict(X_pred)
# print(time.time() - start_time)
accuracy = clf.score(X_test, y_test)

# visualize Learning Curves
# ML.ModelLearning(X, y)
# ML.ModelComplexity(X_train, y_train)

# Linear slope calculation
# print(clf.alpha_)
# print(clf)
# print(clf.coef_)
# print(clf.intercept_)
print('predict accuracy is: {:0.2f}'.format(accuracy))

# build a column in data for predict result
def rigRegModel(X, T, y):
    ridge = RidgeCV(alphas=[0.1, 2.0, 5.0])
    ridge.fit(X, y)
    pre_y = ridge.predict(T)
    # print(pre_y)
    return pre_y
def analyze():
    global test
    global train
    global in_cols
    global target_col
    global ss
    global features
    global fields

    # ADD DATA FROM INPUT FIELDS TO TEST FILE
    addFeatures('C:/Users/Kwabena-Kobiri/Desktop/test.csv', features, fields)
    # ID_output['text'] = features[0]
    ID_output['text'] = features['ID']

    # Modules imported for Analysis
    import pandas as pd
    from sklearn.linear_model import RidgeCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score

    test = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/test.csv',
                       parse_dates=['date'])
    train = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/train.csv',
                        parse_dates=['date'])

    # ADD FEATURES DATAFRAME OBJECT TO THE TEST DATA SET
    # new_features.to_csv('test.csv', mode='a', header=False)

    # Data split for validation
    train_all = train.copy().dropna()
    train = train_all.loc[train_all.date < '2011-01-01']
    valid = train_all.loc[train_all.date > '2011-01-01']
    # print(train.shape, valid.shape)

    # Define input and output columns
    in_cols = train.columns[6:]
    target_col = 'burn_area'
    # in_cols
    # clear()

    # Get our X and y training and validation sets ready
    X_train, y_train = train[in_cols], train[target_col]
    X_valid, y_valid = valid[in_cols], valid[target_col]

    # Create and fit the model
    model = RidgeCV()
    model.fit(X_train, y_train)

    # Make predictions
    preds = model.predict(X_valid)
    # preds = model.predict(features[2:])

    # Score
    # mean_squared_error(y_valid, preds)**0.5  # RMSE - should match Zindi score. Lower is better.

    # VISUALIZE SUBMISSION FILE
    ss = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/SampleSubmission.csv')
    # ss.head()

    # So we need to predict the burn area for each row in test.
    # Add the same features to test as we did to train:
    test['month'] = test.date.dt.month
    test['year'] = test.date.dt.year
    # donar_train['project_submitted_datetime'] = pd.to_datetime(
    #     donar_train.project_submitted_datetime, format='%d-%m-%Y %H:%M')

    # Get predictions. fillna(0) here could be improved by examining the
    # missing data and filling more appropriately.
    preds = model.predict(test[in_cols].fillna(0))

    # Add to submission dataframe
    ss['Prediction'] = preds
    # View
    # ss.head()

    # Save ready for submission:
    ss.to_csv('C:/Users/Kwabena-Kobiri/Desktop/SampleSubmissionOG.csv',
              index=False)
    new = pd.read_csv('C:/Users/Kwabena-Kobiri/Desktop/SampleSubmissionOG.csv')
    # prediction_output['text'] = round(new.Prediction[1], 5)
    prediction_output['text'] = round(new.Prediction[1], 5)
# range() returns an iterator in Python 3, so build explicit lists
ds = list(range(20150701, 20150732)) + list(range(20150801, 20150831))
X3 = np.reshape(np.array(range(122, 184)), (-1, 1)).astype(int)

for song_id in song_ids:
    # Model training
    sql = "select ds, play_times from music_datas " + \
          "where song_id=='%s' and ds<'20150701'" % song_id
    cu2 = conn.cursor()
    cu2.execute(sql)
    ret = cu2.fetchall()
    X1, Y1 = generate_np_data(ret)
    # print(X1, Y1)
    clf.fit(X1, Y1)

    # Predict
    Y3 = clf.predict(X3).tolist()
    # break

    predicts = []
    for (x, y) in zip(ds, Y3):
        if y < 0:
            y = 0
        predicts.append((song_id[0], x, round(y)))
    cu2.executemany('insert into music_prediction values (?, ?, ?)', predicts)

    # progress
    pro.ins().show()
    print("alpha: %f" % clf.alpha_)

conn.commit()
conn.close()
mask = np.logical_or(conditions == 'face', conditions == 'house')
fmri = fmri[mask]
session_id_fmri = session_id_fmri[mask]
conditions = conditions[mask]
"""
train_index = np.where(session_id_fmri != 6)
test_index = np.where(session_id_fmri == 6)

# Split into train and test sets
fmri_train, fmri_test = (fmri[train_index], fmri[test_index])
design_train, design_test = design[train_index], design[test_index]
stimuli_train, stimuli_test = stimuli[train_index], stimuli[test_index]

ridge = RidgeCV()
ridge.fit(fmri_train, design_train)
prediction = ridge.predict(fmri_test)

# ridge_coef = ridge.coef_[4]  # 'face' vs. 'house'
ridge_coef = -ridge.coef_[3] + ridge.coef_[4]  # 'face' vs. 'house'
coef_img = masker.inverse_transform(ridge_coef)
coef_map = coef_img.get_data()
threshold = np.percentile(np.abs(coef_map), 98)

# Plot stat map
plot_stat_map(coef_img, bg_img=haxby_dataset.anat[0], display_mode='z',
              cut_coords=[-5], title=model + " weights")
"""

# Plot time-series
onset = int(onsets[6][3] / tr)
time_series = prediction[onset + delay: onset + delay + tr, 1]
plt.plot(time_series)
regr_svm = svm.SVR()
regr_rfr = RandomForestRegressor(n_estimators=10, random_state=None)
regr_knn = KNeighborsRegressor(n_neighbors=5)
regr_ridge = RidgeCV()

regr.fit(day_target_train, price_data_train)
regr_svm.fit(day_target_train, price_data_train)
regr_rfr.fit(day_target_train, price_data_train)
regr_knn.fit(day_target_train, price_data_train)
regr_ridge.fit(day_target_train, price_data_train)

prediction = regr.predict(day_target_test)
prediction_svm = regr_svm.predict(day_target_test)
prediction_rfr = regr_rfr.predict(day_target_test)
prediction_knn = regr_knn.predict(day_target_test)
prediction_ridge = regr_ridge.predict(day_target_test)

print(prediction)
print(prediction_svm)
print(prediction_rfr)
print(prediction_knn)
print(prediction_ridge)
print(price_data_test)

plt.plot(day_target_train, price_data_train, 'r')
plt.plot(day_target_test, price_data_test, 'g')
plt.plot(day_target_test, prediction_knn, 'c')
plt.plot(day_target_test, prediction_svm, 'b')
plt.plot(day_target_test, prediction, 'k')
plt.plot(day_target_test, prediction_ridge, 'y')
plt.plot(day_target_test, prediction_rfr, 'm')  # was 'y', which duplicated the ridge colour
names =["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS",\ "RAD","TAX","PTRATIO","B","STAT"] features = get_features_matrix(np.loadtxt('../housing_train.dat')) labels = np.loadtxt('../housing_prices_train.dat') alphas = 10**np.linspace(-5, -1, 100) model = RidgeCV(normalize=True, store_cv_values=True, alphas=alphas) model.fit(features, labels) print model.coef_ print model.alpha_ cv_errors = np.mean(model.cv_values_, axis=0) # Test the model features_test = get_features_matrix(np.loadtxt('../housing_test.dat')) prices_test = np.loadtxt('../housing_prices_test.dat') prices_pred = model.predict(features_test) score = np.mean((prices_test-prices_pred)**2) print 'Score', score pl.semilogx(alphas, cv_errors) pl.figure() ax = pl.subplot(111) pl.plot(range(len(model.coef_)), model.coef_, 'o') #ax.set_xticklabels(names) #ax.set_xticks(range(len(model.coef_))) pl.show()