def fit_ridge_cv(theta_matrix, X_matrix, alpha=(1.0,)):
    # Note: recent scikit-learn requires strictly positive alphas, so the original
    # default of [0] (plain least squares) is replaced by a single small positive value.
    reg = RidgeCV(alphas=alpha, fit_intercept=False)
    reg.fit(theta_matrix, X_matrix)
    gamma_vector = reg.coef_
    loss = np.mean(np.square(reg.predict(theta_matrix) - X_matrix))
    score = reg.score(theta_matrix, X_matrix)
    return gamma_vector, loss, score, reg.alpha_
def fitModel(self, Xindex, Findex, X, Y):
    if not hasattr(self, "model"):
        self.model = []
    if "ridge_alpha" in self.args:
        alpha = self.args['ridge_alpha']
    else:
        ridgecv = RidgeCV(alphas=(10., 50., 100.))
        ridgecv.fit(X, Y[:, 0])
        logging.info("Ridge cv %f", ridgecv.alpha_)
        logging.info("Ridge score %f", ridgecv.score(X, Y[:, 0]))
        alpha = ridgecv.alpha_
    # Share the training data with the worker processes through module-level globals.
    global ridgeX, ridgeY, ridgeAlpha
    ridgeX = X
    ridgeY = Y
    ridgeAlpha = alpha
    # Fit one Ridge model per remaining target column, in groups of 18, six processes at a time.
    for idx in self.divide(list(range(len(self.model), Y.shape[1])), 18):
        with mp.Pool(6) as pool:
            self.model += pool.map(train_ridge, idx)
        logging.info("Ridge group %d", idx[0])
        self.saveModel()
    logging.info("Ridge ok")
def fitRidge(self, X, Y, name="ridge"):
    if not hasattr(self, name):
        ridge = []
        setattr(self, name, ridge)
    else:
        ridge = getattr(self, name)
    if "ridge_alpha" in self.args:
        alpha = self.args['ridge_alpha']
    else:
        ridgecv = RidgeCV(alphas=(10., 50., 100.))
        ridgecv.fit(X, Y[:, 24])
        logging.info("Ridge cv %f", ridgecv.alpha_)
        logging.info("Ridge score %f", ridgecv.score(X, Y[:, 24]))
        alpha = ridgecv.alpha_
    global ridgeX, ridgeY, ridgeAlpha
    ridgeX = X
    ridgeY = Y
    ridgeAlpha = alpha
    for idx in self.divide(list(range(len(ridge), Y.shape[1])), 18):
        with mp.Pool(6) as pool:
            ridge += pool.map(train_ridge, idx)
        logging.info("Ridge group %d", idx[0])
        self.saveModule(name, False)
    logging.info("Ridge ok")
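`train_ridge` is handed to `mp.Pool` in both methods above but is not shown in this excerpt. Given the `ridgeX`/`ridgeY`/`ridgeAlpha` globals they populate, a plausible module-level worker might look like the sketch below (an assumption, not the original code):

from sklearn.linear_model import Ridge

def train_ridge(idx):
    # Assumed worker: fit one Ridge model on target column `idx`
    # using the globals set up before the pool starts.
    model = Ridge(alpha=ridgeAlpha)
    model.fit(ridgeX, ridgeY[:, idx])
    return model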
def train_rigdeCV(self, data):
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion
    # print("The training set has {} rows and {} columns".format(x_tr.shape[0], x_tr.shape[1]))
    # print("The validation set has {} rows and {} columns".format(x_val.shape[0], x_val.shape[1]))
    print('Start training Ridge...')
    start_time = self.timer()
    # Only one candidate alpha is supplied, so the cross-validation is a formality here.
    # normalize=True was removed in scikit-learn 1.2; scale the features beforehand on newer versions.
    ridge = RidgeCV(normalize=True, alphas=[0.0000999], cv=10)
    ridge.fit(x_tr, y_tr)
    print("The R2 is: {}".format(ridge.score(x_tr, y_tr)))
    print("The alpha chosen by CV is: {}".format(ridge.alpha_))
    self.timer(start_time)
    print("Making prediction on validation data")
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(ridge.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("The mean absolute error is {}".format(mae))
    print('Saving model into a pickle')
    try:
        os.mkdir('pickles')
    except OSError:
        pass
    with open('pickles/RidgeCV.pkl', 'wb') as f:
        pickle.dump(ridge, f)
    print('Making prediction and saving into a csv')
    y_test = ridge.predict(self.x_test)
    return y_test
def Ridge_Data(context, x, y):
    x, y = data_process(x, y)
    # Standardize independent variables and response variable (necessary for ridge regularization)
    from sklearn.preprocessing import StandardScaler
    x_std = StandardScaler().fit_transform(x)
    y_std = StandardScaler().fit_transform(y.reshape(-1, 1))
    # Ridge Regression RidgeCV() defaults:
    #   alphas=(0.1, 1.0, 10.0)
    #   fit_intercept=True
    #   cv=None: use the efficient leave-one-out cross-validation
    #   cv=int: specify the number of folds
    from sklearn.linear_model import RidgeCV
    ridgecv = RidgeCV()
    ridgecv.fit(x_std, y_std)
    ridgecv_score = ridgecv.score(x_std, y_std)
    # ridge penalty: alpha
    ridgecv_alpha = ridgecv.alpha_
    print('Ridge R square', ridgecv_score)
    print('Ridge Alpha', ridgecv_alpha)
    print('Ridge Coefficients', ridgecv.coef_)
    # Estimated coefficients are the same!!
    # Augment the data so that plain least squares on (x_ridge, y_ridge) reproduces the ridge solution.
    import math
    k = len(x_std[0])
    y_ridge = np.append(y_std, np.zeros(k))
    x_ridge = np.append(x_std, np.identity(k) * math.sqrt(ridgecv_alpha), axis=0)
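    # Sketch (not in the original snippet): fitting ordinary least squares on the padded
    # system above should recover roughly the same coefficients, because
    # ||y - Xb||^2 + alpha*||b||^2 equals the residual sum of squares of the augmented
    # problem. The `ols_check` name is ours; on standardized data the intercept is ~0.
    from sklearn.linear_model import LinearRegression
    ols_check = LinearRegression(fit_intercept=False).fit(x_ridge, y_ridge)
    print('Augmented OLS Coefficients', ols_check.coef_)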
def ridge_regression_cv(problem, **kwargs):
    r"""High level description.

    Parameters
    ----------
    problem : type
        Description
    kwargs : dictionary
        kwargs['ridge_reg_coefs'] must be a list of nonnegative floats.
        These are the multipliers for the penalty term in cross-validation
        of ridge regression.
        kwargs['coef_tolerance'] must be a nonnegative float.

    Returns
    -------
    output : tuple
        (optimum, maximum)
    """
    data_list = [datum['data']['values'] for datum in problem.data]
    data = numpy.array(data_list)
    ridge = RidgeCV(kwargs['ridge_reg_coefs'])
    ridge.fit(data.T, problem.goal['data']['values'])
    ridge_regression_coefficients = ridge.coef_
    optimum = [problem.data[index]
               for index, element in enumerate(ridge_regression_coefficients)
               if abs(element) > kwargs['coef_tolerance']]
    maximum = ridge.score(data.T, problem.goal['data']['values'])
    output = (optimum, maximum)
    return output
def Ridge_model(train_linear, test_linear):
    # Note: train_linear_fea, train_linear_tar, x_train, x_test, y_train, y_test,
    # real_train_tar and evaluate() are expected to exist in the enclosing scope.
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    start = time.time()
    # normalize=True was removed in scikit-learn 1.2; scale the features beforehand on newer versions.
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    # ridge.set_params(alpha=6, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_ridge_predict = ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_ridge = np.expm1(ridge.predict(test_linear))
    write_pkl(
        ridgecv_alpha,
        '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl'
    )
    return test_prediction_ridge
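`write_pkl` is called above but not defined in this excerpt; a minimal stand-in that pickles an object to the given path (behaviour assumed) could be:

import pickle

def write_pkl(obj, path):
    # Assumed helper: serialise `obj` to `path` with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f)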
def regularizedreg(Xtrain, Xtest, ytrain, ytest):
    Rclf = RidgeCV(alphas=[1, 2, 20, 40, 50])
    # RidgeCV(alphas=[0.1, 1.0, 2.0, 4.0, 20.0], cv=None, fit_intercept=True, scoring=None, normalize=False)
    Rclf.fit(Xtrain, ytrain)
    # This is the mean squared error on the test set (the original label said "residual sum of squares").
    print("Mean squared error: %.2f" % np.mean((Rclf.predict(Xtest) - ytest) ** 2))
    print('Regularization chosen, alpha = %.2f' % Rclf.alpha_)
    print('Coef values = ', Rclf.coef_)
    print('Variance score: %.2f' % Rclf.score(Xtest, ytest))
def forregCV(X, y, regressor):
    kf = RepeatedKFold(n_splits=5, n_repeats=30)
    if regressor == ridge_regression:
        print("ridge")
        regressor = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=kf).fit(X, y)
        print('Ridge CV: ', regressor.score(X, y))
        return
    if regressor == lasso_regression:
        print("lasso")
        regressor = LassoCV(alphas=[1e-3, 1e-2, 1e-1, 1, 10], cv=kf).fit(X, y)
        print('Lasso CV: ', regressor.score(X, y))
        return
    # REPEATED KFOLD CROSS-VAL
    # midl = []
    scores = cross_val_score(regressor, X, y, cv=kf, n_jobs=-1)
    print("---> ", scores.mean())
def RidgeKFold(data=data, city='all', label="label_activity_density"):
    if city == 'all':
        data2 = data.copy()
    else:
        data2 = data[data["city_district"].str.contains(city)].copy()
    target = data2[["city_district", label]]
    features = data2[features_columns]
    X = features.values
    y = target[label].values
    clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)
    print(clf.score(X, y))
    return clf.score(X, y)
def ridge_reg(x, y):
    ridgecv = RidgeCV(alphas=(0.1, 10, 50), cv=20)
    ridgecv.fit(x, y)
    ridgecv_score = ridgecv.score(x, y)
    ridgecv_alpha = ridgecv.alpha_
    print('Ridge R square', ridgecv_score)
    print('Ridge Alpha', ridgecv_alpha)
    return ridgecv.coef_
def feature_ridge(self):
    model = RidgeCV()
    model.fit(self.x, self.y)
    coefficients = pd.Series(model.coef_, index=self.x.columns)
    print("Beta weights/co-efficients (L2 regularisation)")
    print("-----------------------------------------")
    print(coefficients)
    print('\n')
    print('R2 score is {}'.format(model.score(self.x, self.y)))
def linear_reg_single_meter(X_train, X_test, y_train, y_test):
    # Fit your model using the training set
    linear = LinearRegression()
    lasso_cv = LassoCV(cv=5, random_state=0)
    ridge_cv = RidgeCV(alphas=(0.1, 1.0, 10.0))
    linear.fit(X_train, y_train)
    lasso_cv.fit(X_train, y_train)
    ridge_cv.fit(X_train, y_train)
    print("Variance Inflation Factors")
    print(vifs(X_test))
    print('\n')
    print('Features')
    print('\n')
    print(list(X_test.columns))
    print('Linear regression score on train set with all parameters: {}'.format(
        linear.score(X_train, y_train)))
    print('Linear regression score on test set with all parameters: {}'.format(
        linear.score(X_test, y_test)))
    # print('Linear regression crossVal score on train set with all parameters: {}'.format(linear.score(X_train, y_train)))
    # print('Linear regression crossVal score on test set with all parameters: {}'.format(linear.score(X_test, y_test)))
    print('LassoCV regression score on train set with all parameters: {}'.format(
        lasso_cv.score(X_train, y_train)))
    print('LassoCV regression score on test set with all parameters: {}'.format(
        lasso_cv.score(X_test, y_test)))
    # print('LassoCV regression crossVal score on train set with all parameters: {}'.format(lasso_cv.score(X_train, y_train)))
    # print('LassoCV regression crossVal score on test set with all parameters: {}'.format(lasso_cv.score(X_test, y_test)))
    print('RidgeCV regression score on train set with all parameters: {}'.format(
        ridge_cv.score(X_train, y_train)))
    print('RidgeCV regression score on test set with all parameters: {}'.format(
        ridge_cv.score(X_test, y_test)))
    # print('RidgeCV regression crossVal score on train set with all parameters: {}'.format(ridge_cv.score(X_train, y_train)))
    # print('RidgeCV regression crossVal score on test set with all parameters: {}'.format(ridge_cv.score(X_test, y_test)))
    return ridge_cv, lasso_cv, linear
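`vifs` is printed above but not defined in this snippet; one common way to compute variance inflation factors for a feature DataFrame uses statsmodels. A sketch under that assumption (the helper's exact behaviour is not from the original):

import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor

def vifs(X):
    # Assumed helper: one VIF per column of the feature DataFrame X.
    return pd.Series(
        [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
        index=X.columns,
        name='VIF',
    )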
def calculate_slopes(data, osc_idx, osc_idx_type, N):
    # Initialization of the arrays where values are saved
    slope = np.zeros((data.shape[0], osc_idx.shape[0]))
    pval = np.zeros((data.shape[0], osc_idx.shape[0]))
    rsquared = np.zeros((data.shape[0], 1))
    # If crop yield data exists, calculate the slope coefficient for each oscillation and FPU
    # using regularized ridge regression.
    for k in range(0, slope.shape[0]):
        y = data[k, :]
        if all(~np.isnan(y)):
            if 'multiv' in osc_idx_type:
                X0 = np.hstack((osc_idx.T, np.ones((osc_idx[1, :][:, np.newaxis].shape))))
                ridge_model = RidgeCV(
                    alphas=[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]).fit(X0, y)
                coeffs = ridge_model.coef_
                rsquared_temp = ridge_model.score(X0, y)
                coeffs_sample = np.empty((0, 4))
                for j in range(0, N):
                    rand_idx_data = np.random.choice(data.shape[1], data.shape[1], replace=True)
                    y_sample = y[rand_idx_data]
                    X0_sample = X0[rand_idx_data, :]
                    ridge_model_sample = RidgeCV(
                        alphas=[1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10]).fit(X0_sample, y_sample)
                    coeffs_sample = np.vstack((coeffs_sample, ridge_model_sample.coef_[np.newaxis, :]))
                # Calculate how many of the sampled slope values are smaller or larger than zero.
                bstrp_test = np.zeros((2, coeffs.shape[0]))
                bstrp_test[0, :] = np.sum(coeffs_sample > 0, axis=0)
                bstrp_test[1, :] = np.sum(coeffs_sample < 0, axis=0)
                # Take the larger of the two counts, divide by the total number of samples,
                # subtract from 1, and multiply by two (two-sided test) to get the p-value.
                pvals = (1 - np.amax(bstrp_test, axis=0) / N) * 2.0
                slope[k, :] = coeffs[0:3]
                pval[k, :] = pvals[0:3]
                rsquared[k, 0] = rsquared_temp
        else:
            slope[k, 0] = np.nan
    return slope, pval, rsquared
def ridge_regression(X_train, y_train, X_test, y_test, plot):
    """
    Performing a ridge regression with built-in CV and plotting the feature importance
    """
    # Fit the ridge regression
    reg = RidgeCV()
    reg.fit(X_train, y_train)
    print("Best alpha using built-in RidgeCV: %f" % reg.alpha_)
    print("Best score using built-in RidgeCV: %f" % reg.score(X_train, y_train))
    coef = pd.Series(reg.coef_, index=X_train.columns)
    # Note: unlike the lasso, ridge shrinks coefficients towards zero but rarely sets them
    # exactly to zero, so the "eliminated" count below is usually 0.
    print(
        "Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other "
        + str(sum(coef == 0)) + " variables"
    )
    # Extract the feature importance
    imp_coef = coef.sort_values()
    # Plot the feature importance
    if plot:
        plt.rcParams["figure.figsize"] = (8.0, 10.0)
        imp_coef.plot(kind="barh")
        plt.title("Feature importance using Ridge Model")
        plt.show()
        # Visualizing the regression
        visualizer = ResidualsPlot(reg, size=(1080, 720))
        visualizer.fit(X_train, y_train)    # Fit the training data to the visualizer
        visualizer.score(X_test, y_test)    # Evaluate the model on the test data
        visualizer.show()                   # Finalize and render the figure
    # Using the test data to calculate a score
    y_pred = reg.predict(X_test)
    # Return metrics
    return {
        "name": "Ridge Regression",
        "R squared": reg.score(X_test, y_test),
        "R squared training": reg.score(X_train, y_train),
        "RMSE": rmse(y_test, y_pred),
        "MAE": mean_absolute_error(y_test, y_pred),
    }
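`rmse` is referenced in the returned metrics but not defined here; a minimal stand-in (behaviour assumed from the name) is:

import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    # Assumed helper: root mean squared error.
    return np.sqrt(mean_squared_error(y_true, y_pred))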
def scale_test_and_train_ridge(X, y):
    """
    Run a ridge regression on the model
    """
    X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)
    X_train_scale = X_train.values
    X_val_scale = X_val.values
    X_test_scale = X_test.values
    scale = StandardScaler()
    X_train_scale = scale.fit_transform(X_train_scale)
    X_test_scale = scale.transform(X_test_scale)
    X_val_scale = scale.transform(X_val_scale)
    ridge = RidgeCV(cv=5)
    ridge.fit(X_train_scale, y_train)
    ridge.score(X_train_scale, y_train)
    y_pred = ridge.predict(X_val_scale)
    print(f'Ridge Regression val R^2: {ridge.score(X_val_scale, y_val):.3f}')
    print(f'Ridge Regression val RMSE: {sqrt(mean_squared_error(y_val, y_pred)):.3f}')
    return ridge.coef_
def doRidgeRegressionCV(X_train, y_train, X_test, y_test):
    print('Now doing Ridge cross validation')
    alpha_ridge = [0.001, 0.01, 0.1, 1, 10, 100, 500, 1000]
    rf = RidgeCV(alphas=alpha_ridge, store_cv_values=True).fit(X_train, y_train)
    print(rf.score(X_test, y_test))
    # For a single target, cv_values_ has shape (n_samples, n_alphas);
    # averaging over samples gives one leave-one-out MSE per alpha.
    cv_mse = np.mean(rf.cv_values_, axis=0)
    print("alphas: %s" % alpha_ridge)
    print("CV MSE: %s" % cv_mse)
    print("Best alpha using built-in RidgeCV: %f" % rf.alpha_)
    plt.plot(np.array(alpha_ridge, dtype="float64"),
             np.array(cv_mse, dtype="float64"))
    plt.xlabel('Value of alpha for Ridge')
    plt.ylabel('Cross-validated MSE')
def build_ts_regression(self, feature_list, target, dt_index, model_list=['ridge']):
    """
    Takes features and target.
    Does a time-ordered train test split.
    Reports on performance.
    Returns: model
    """
    # Fix missing values
    print('\n[INFO] Imputing missing values')
    self.fix_missing()
    print('\nMissing Values:')
    print(self.df.isna().sum())

    test_size = 0.3
    self.df.sort_values(dt_index, ascending=True, inplace=True)
    nrows = self.df.shape[0]
    train_idx = int(nrows * (1 - test_size))
    X = self.df[feature_list]
    y = self.df[target]
    # Train on the earliest rows, test on the most recent ones.
    self.X_train = X.iloc[0:train_idx]
    self.X_test = X.iloc[train_idx:]
    self.y_train = y.iloc[0:train_idx]
    self.y_test = y.iloc[train_idx:]
    print('Xtrain size:', self.X_train.shape[0], 'Xtest size:', self.X_test.shape[0])

    for m in model_list:
        if m == 'ridge':
            from sklearn.linear_model import RidgeCV
            # Choosing a CV number (check the larger threshold first; cv must be at least 2,
            # so small datasets fall back to the efficient leave-one-out default).
            if self.df.shape[0] > 500:
                cv = 5
            elif self.df.shape[0] > 100:
                cv = 3
            else:
                cv = None
            model = RidgeCV(alphas=(0.1, 1.0, 10.0), cv=cv)
            model.fit(self.X_train, self.y_train)
            print('\nRidge Regression R-squared:', model.score(self.X_test, self.y_test))
            # Add the model to the output list
            self.models.append(model)
def run():
    # Data preprocessing
    train = DataPrep.prep_data(headless_run)
    # Scale data: https://scikit-learn.org/stable/modules/svm.html#tips-on-practical-use
    target = train.SalePrice
    train = train.drop(columns='SalePrice')
    X_train, X_test, y_train, y_test = train_test_split(
        train, target, test_size=0.25, random_state=0)
    # Trying L2 regularization
    clf = RidgeCV(cv=5).fit(X_train, y_train)
    # print(rmse_cv(clf).mean())
    # Unlike the lasso, ridge keeps all coefficients and only shrinks them towards zero
    coef = pd.Series(clf.coef_, index=X_train.columns)
    # Metrics (y_true is passed first)
    variance_score = clf.score(X_test, y_test)
    MSEscore = mean_squared_error(y_test, clf.predict(X_test))
    MAEscore = median_absolute_error(y_test, clf.predict(X_test))
    R2score = r2_score(y_test, clf.predict(X_test))
    if not headless_run:
        print('Variance score: {}'.format(variance_score))
        # print("CLF best: {}".format(clf.best_score_))  # grid search only
        print('MSE score: {}'.format(MSEscore))
        print('MAE score: {}'.format(MAEscore))
        print('R2 score: {}'.format(R2score))
        # Plotting Residuals
        plt.scatter(clf.predict(X_train), clf.predict(X_train) - y_train,
                    color="green", s=10, label='Train data')
        plt.scatter(clf.predict(X_test), clf.predict(X_test) - y_test,
                    color="blue", s=10, label='Test data')
        plt.hlines(y=0, xmin=10, xmax=14, linewidth=2)
        plt.legend(loc='upper right')
        plt.title("Residual errors")
        plt.show()
    else:
        return [variance_score, MSEscore, MAEscore, R2score]
def ridge_regression(y_train, x_train, df_test):
    # Coarse search for alpha, then a finer search around the best value.
    ridge = RidgeCV(alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
    ridge.fit(x_train, y_train)
    alpha = ridge.alpha_
    ridge = RidgeCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75,
                            alpha * .8, alpha * .85, alpha * .9, alpha * .95,
                            alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                            alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],
                    cv=10)
    ridge.fit(x_train, y_train)
    alpha = ridge.alpha_
    print('ALPHA', alpha)
    acc_log = round(ridge.score(x_train, y_train) * 100, 2)
    print('SCORE', acc_log)
    pred = ridge.predict(df_test)
    return pred
def ridge_boston():
    boston = load_boston()
    x = boston.data
    y = boston.target
    train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=.25)
    std_s = StandardScaler()
    train_x = std_s.fit_transform(train_x)
    # Reuse the scaler fitted on the training data; refitting on the test set leaks information.
    test_x = std_s.transform(test_x)
    # ridge = Ridge(alpha=1.5)
    ridge = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1], cv=4)
    ridge.fit(train_x, train_y)
    score = ridge.score(test_x, test_y)
    predict_y = ridge.predict(test_x)
    print(score)
    print(predict_y[:20])
    print(test_y[:20])
    return None
class RidgeWithPost(BaseEstimator, TransformerMixin):
    def __init__(self, weight=1.0):
        # `weight` is the ridge penalty; RidgeCV expects a sequence of candidate alphas.
        self.ridge = RidgeCV(alphas=(weight,))

    def fit(self, X, y, sample_weight=None):
        self.ridge.fit(X, y)
        return self

    def predict(self, X):
        y = self.ridge.predict(X)
        # Post-processing: clip predictions below 18 up to 18.
        ranged = np.empty(len(y))
        for i in range(0, len(y)):
            if y[i] < 18:
                ranged[i] = 18
            else:
                ranged[i] = y[i]
        return ranged

    def score(self, X, y, sample_weight=None):
        return self.ridge.score(X, y)
def ridge_reg():
    from sklearn.linear_model import RidgeCV
    n_alphas = 100
    alpha_vals = np.logspace(-1, 3, n_alphas)
    rr = RidgeCV(alphas=alpha_vals, cv=10)
    rr.fit(X_train_scaled, y_train)
    y_pred_train = rr.predict(X_train_scaled)
    # y_pred_train_round = np.round(y_pred_train)
    y_pred_test = rr.predict(X_test_scaled)
    # y_pred_test_round = np.round(y_pred_test)
    print(rr.alpha_)
    print(rr.score(X_test_scaled, y_test))
    # plot_conf_mat(y_test, _pred_round)
    global metrics_ridge
    metrics_ridge = [
        accuracy_score(y_test, np.round(y_pred_test)),
        mean_squared_error(y_test, y_pred_test),
        r2_score(y_test, y_pred_test)
    ]
    return scores_results(y_train, y_test, y_pred_train, y_pred_test)
def NLP(self):
    """Performs a linear regression on a TF-IDF matrix of property titles vs. revenue potential"""
    # load and vectorize titles
    corpus = self.comps['title']
    target = self.comps['rev_pot']
    vec = TfidfVectorizer(tokenizer=self.spacy_tokenizer, max_features=15,
                          max_df=1.0, ngram_range=(1, 1))
    matrix = vec.fit_transform(corpus)
    # perform ridge regression and return dataframe of coefficients
    ls = RidgeCV()
    ls.fit(matrix, target)
    # get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() there.
    coefficients_df = pd.DataFrame.from_dict(
        dict(zip(vec.get_feature_names(), ls.coef_)),
        orient='index').sort_values(by=0)
    score = ls.score(matrix, target)
    print(f"R^2 = {score: .3f} \n Alpha: {ls.alpha_ : .3f}")
    return coefficients_df
def eval_score(self, X, n):
    """
    RidgeCV

    Parameters
    -------------
    X: pandas dataframe
    n: number of train_test_split repetitions

    Return
    -------------
    score: average score
    """
    scores = []
    for _ in range(n):
        X_train, X_test, y_train, y_test = train_test_split(X, self.y, test_size=0.4)
        model = RidgeCV()
        model.fit(X_train, y_train)
        scores.append(model.score(X_test, y_test))
    score = np.array(scores).mean()
    return score
def build_regression(self, feature_list, target, model_list=['ridge']):
    """
    Takes features and target.
    Does train test split.
    Reports on performance.
    Returns: model
    """
    # Fix missing values
    print('\n[INFO] Imputing missing values')
    self.fix_missing()
    print('\nMissing Values:')
    print(self.df.isna().sum())

    seed = 4784
    from sklearn.model_selection import train_test_split
    X = self.df[feature_list]
    y = self.df[target]
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, random_state=seed)

    for m in model_list:
        if m == 'ridge':
            from sklearn.linear_model import RidgeCV
            # Choosing a CV number (check the larger threshold first; cv must be at least 2,
            # so small datasets fall back to the efficient leave-one-out default).
            if self.df.shape[0] > 500:
                cv = 5
            elif self.df.shape[0] > 100:
                cv = 3
            else:
                cv = None
            model = RidgeCV(alphas=(0.1, 1.0, 10.0), cv=cv)
            model.fit(self.X_train, self.y_train)
            print('\nRidge Regression R-squared:', model.score(self.X_test, self.y_test))
            # Add the model to the output list
            self.models.append(model)
def Model(train_linear, test_linear):
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(
        train_linear_fea, train_linear_tar, test_size=0.2, random_state=0)

    def evaluate(model, test_features, test_labels, train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy

    real_train_tar = np.expm1(train_linear_tar)

    """
    . Lasso model
    """
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)
    start = time.time()
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso = pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_lasso_predict = lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_lasso = np.expm1(lasso.predict(test_linear))

    """
    . Ridge model
    """
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    start = time.time()
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_ridge_predict = ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_ridge = np.expm1(ridge.predict(test_linear))

    """
    . Random Forest
    """
    # train = train.drop(columns=['DateSold'])
    # test = test.drop(columns=['DateSold'])
    # X_train = train.drop(columns=['SalePrice'])
    # Y_train = train['SalePrice']
    X_train = train_linear_fea
    Y_train = train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(
        X_train, Y_train, test_size=0.2, random_state=0)
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    rf = RandomForestRegressor()
    # Random search of parameters, using 3-fold cross validation,
    # search across 100 different combinations, and use all available cores
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, Y_train)
    # rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    # Random search allowed us to narrow down the range for each hyperparameter. Now that we know
    # where to concentrate our search, we can explicitly specify every combination of settings to try.
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110, 120, 130],
        'max_features': [2, 3],
        'min_samples_leaf': [1, 2, 3, 4],
        'min_samples_split': [2, 4, 6, 8, 10, 12],
        'n_estimators': [600, 700, 800, 900, 1000]
    }
    # Create a base model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    # grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    best_random = grid_search.best_estimator_
    start = time.time()
    best_random.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(best_random, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_rf_predict = best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features': train_linear_fea.columns,
                                  'imp': best_random.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_rf = importance_rf.iloc[:20, ]
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    test_prediction_rf = np.expm1(best_random.predict(test_linear))

    """
    . Xgboost
    """
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum for sum of weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum nodes in each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}
    # Make a RandomizedSearchCV object with correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    """
    best_params_ = {'learning_rate': 0.1,
                    'max_depth': 2,
                    'min_child_weight': 4,
                    'n_estimators': 900,
                    'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    # model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))

    return (test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,
            y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
X = X[:-predPeriod]  # re-size the features for training
dataset.dropna(inplace=True)  # get rid of NaN in the 'label' column

# create label
y = np.array(dataset['label'])
# Note: sklearn.cross_validation was removed; train_test_split now lives in sklearn.model_selection.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2, random_state=1)

# use ridge regression with CV as the algorithm (plain LinearRegression kept for reference)
# clf = LinearRegression()
clf = RidgeCV(alphas=[0.1, 0.5, 1, 10])
clf.fit(X_train, y_train)
# start_time = time.time()
y_pred = clf.predict(X_pred)
# print(time.time() - start_time)
accuracy = clf.score(X_test, y_test)

# visualize Learning Curves
# ML.ModelLearning(X, y)
# ML.ModelComplexity(X_train, y_train)

# Linear slope calculation
# print(clf.alpha_)
# print(clf)
# print(clf.coef_)
# print(clf.intercept_)
print('predict accuracy is: {:0.2f}'.format(accuracy))

# build a column in data for predict result
data['predict/Adj Close'] = data['Adj Close']  # add column for predict value/Adj Close
# RidgeCV regression with 10-fold cross-validation over alpha values of 0.1, 1 and 10
Ridge_CV = RidgeCV(alphas=(0.1, 1.0, 10.0), cv=10)
Ridge_CV.fit(x_train, y_train)
predicted_Ridge_CV = Ridge_CV.predict(x_test)

plt.scatter(y_test, predicted_Ridge_CV)
plt.plot([-1, 5], [-1, 5], "g--", lw=1, alpha=0.4)
plt.xlabel("True prices (EUR)")
plt.ylabel("Predicted prices (EUR)")
plt.text(-1, 2.5, ' R-squared = {}'.format(round(float(Ridge_CV.score(x_test, y_test)), 2)))
plt.text(-1, 3, ' MSE = {}'.format(round(float(mean_squared_error(y_test, predicted_Ridge_CV)), 2)))
plt.title('Ridge (Alpha = {}) - Predicted prices (EUR) vs. True prices (EUR)'.format(Ridge_CV.alpha_))
plt.show()

# 10-fold cross-validation of the previous Ridge regression
ridge = Ridge(alpha=0.1)
shuffle = KFold(n_splits=10, shuffle=True, random_state=0)
cv_scores = cross_val_score(ridge, x, y, cv=shuffle)
print(cv_scores)
print(cv_scores.mean())
model1 = LinearRegression().fit(X_m1, y_m)
print(f"R2 Score: {model1.score(X_m1, y_m)}")

"""## Regularization

1. Lasso
2. Ridge
3. ElasticNet

### Ridge
"""

# The higher the alpha value, the more the coefficients are restricted;
# the lower the alpha, the more general the model, with the coefficients barely restricted at all.
rr = RidgeCV(cv=5, fit_intercept=False)
rr.fit(X_m, y_m)
rr.score(X_m, y_m)
rr.alpha_

plt.plot(rr.coef_, alpha=0.7, marker='*', markersize=10, color='red', label=r'Ridge; $\alpha =10$')
plt.grid(True)
plt.xticks(range(0, 28, 1))
plt.legend()
plt.show()

"""# Model Accuracy Metrics

You must use the Mean Squared Error & Mean Absolute Error for your model evaluations.
You may also include extra metrics for calculating the scores.
"""

def MSE(model_preds, ground_truths):
    # The original snippet is cut off here; a plain mean-squared-error body is the obvious completion.
    return np.mean((np.array(model_preds) - np.array(ground_truths)) ** 2)
print("---------------------------------------------------------------------") #岭回归,参数估计,固定岭参数 X = dfx.iloc[:, 0:5] #print(X) y = dfy_scaled reg01 = Ridge(alpha=0.15).fit(X, y) print('Ridge(alpha=0.15) score:', reg01.score(X, y).round(5)) #0.98513 print('Ridge(alpha=0.15) coefficients:', reg01.coef_.round(5), '\n') #[-0.05087 0.54623 0.39501 -0.12857 -0.03614] print("---------------------------------------------------------------------") #岭回归,按 CV 标准自动选择岭参数 alphas = np.linspace(0.0001, 0.5, 1000) reg02 = RidgeCV(alphas).fit(X, y) print('RidgeCV score:', reg02.score(X, y).round(5)) print('RidgeCV alpha:', reg02.alpha_.round(5)) #0.33737 print('RidgeCV coefficients:', reg02.coef_.round(5), '\n') print("---------------------------------------------------------------------") #lasso求解 count = 0 lamb = 0.05 lasso_reg = Lasso(alpha=lamb) lasso_reg.fit(dfx, dfy) print('Lasso Intercept:', lasso_reg.intercept_) print('Lasso Coef:', '\n', lasso_reg.coef_) #print(type(lasso_reg.coef_)) for n in lasso_reg.coef_: if ((n > 1e-5) or (n < -1e-5)): count = count + 1
##################
# CLASSIFICATION #
##################
cv = 5
clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
cv_outer = KFold(n_splits=10, shuffle=True, random_state=1)
outer_results = list()
outer_scores = list()
for train_ix, test_ix in cv_outer.split(X):
    # split data
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]
    # Note: fit() returns the same estimator object, so every entry of outer_results ends up
    # pointing at the final fit; use sklearn.base.clone(clf) per split to keep them distinct.
    outer_results.append(clf.fit(X_train, y_train))
    outer_scores.append(clf.score(X_test, y_test))

#################
# PRINT RESULTS #
#################
for i_split, results in enumerate(outer_results):
    coefs_abs = np.abs(results.coef_)
    IX_coefs = np.argsort(-coefs_abs)
    coef_abs_sorted = coefs_abs[IX_coefs]
    print('Split ', i_split + 1)
    print(results.alpha_)
    print(1 + IX_coefs[:10])
    print(coef_abs_sorted[:10])
print(outer_scores)
print(np.mean(outer_scores))
print(ridge)
print("Percent variance explained: {0}".format(ridge.score(X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(ridge.coef_, col, sort=True)))
print("ORDINARY LEAST SQUARES")
print(ols)
print("Percent variance explained: {0}".format(ols.score(X_aging, y_aging)))
print("Coefficients found: \n{0}\n".format(prettyprint(ols.coef_, col, sort=True)))
print("WHOLE DATASET //////////////////////////")

print("SUPER AGERS //////////////////////////")
ridge = RidgeCV(alphas=alpha_params, cv=7, scoring=score)
ridge.fit(X_sa, y_sa)
ols = LinearRegression()
ols.fit(X_sa, y_sa)
print("RIDGE REGRESSION")
print("Percent variance explained: {0}".format(ridge.score(X_sa, y_sa)))
print("Coefficients found: \n{0}\n".format(prettyprint(ridge.coef_, col, sort=True)))
print("ORDINARY LEAST SQUARES")
print("Percent variance explained: {0}".format(ols.score(X_sa, y_sa)))
print("Coefficients found: \n{0}\n".format(prettyprint(ols.coef_, col, sort=True)))
print("SUPER AGERS //////////////////////////")

print("MCIS //////////////////////////")
ridge = RidgeCV(alphas=alpha_params, cv=7, scoring=score)
ridge.fit(X_mci, y_mci)
ols = LinearRegression()
ols.fit(X_mci, y_mci)
print("RIDGE REGRESSION")
print("Percent variance explained: {0}".format(ridge.score(X_mci, y_mci)))
print("Coefficients found: \n{0}\n".format(prettyprint(ridge.coef_, col, sort=True)))
print("ORDINARY LEAST SQUARES")
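`prettyprint` is used throughout this excerpt but never defined; a plausible stand-in that labels coefficients with their column names and optionally sorts them by magnitude (behaviour assumed, not taken from the original) is:

import numpy as np
import pandas as pd

def prettyprint(coefs, columns, sort=False):
    # Assumed helper: pair each coefficient with its feature name;
    # optionally sort by absolute magnitude, largest first.
    s = pd.Series(np.ravel(coefs), index=columns)
    if sort:
        s = s.reindex(s.abs().sort_values(ascending=False).index)
    return s.to_string()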