def ridge_regressor(df):
    """Fit a ridge regression predicting "price" and report hold-out metrics.

    INPUT: Pandas dataframe with a "price" column (the target); every other
        column is used as a feature. NOTE: "price" is popped from ``df`` in
        place, so the caller's frame loses that column.
    OUTPUT: flat tuple of alternating labels and values: R^2, root mean
        squared error, mean of |error| / actual (a fraction, not multiplied
        by 100) and the zipped (feature name, coefficient) pairs. All
        metrics are computed on a fixed 30% hold-out split (random_state=0).
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(
        X, y, test_size=0.3, random_state=0)

    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_

    residuals = ytest - clf.predict(xtest)
    # ``rmse`` is reported in the result, so it has to be computed here;
    # leaving it undefined made every call fail with NameError.
    rmse = np.sqrt(np.mean(residuals ** 2))
    mae_percent = np.mean(np.absolute(residuals) / ytest)
    return (
        "R^2 is ",
        score,
        "RMSE is ",
        rmse,
        "MAE percent is ",
        mae_percent,
        "Feature coefficients are ",
        zip(feature_names, feat_imps),
    )
def compute_linear_model(mfs, measures):
    """Fit a ridge model explaining ``fexp`` from BMD plus the MFS data.

    The first column of ``measures`` is used as the BMD predictor and the
    last column as the ``fexp`` target; ``mfs`` is stacked column-wise next
    to BMD to form the design matrix. The in-sample R^2 is printed; nothing
    is returned.
    """
    from sklearn.linear_model import Ridge

    # Ridge(alpha=1.0) is the current choice; RidgeCV(alphas=[0.1, 1.0, 10.0])
    # and plain LinearRegression are the alternatives that were tried.
    clf = Ridge(alpha=1.0)

    # explain fexp using BMD + the MFS data
    fexp = measures[:, -1]
    bmd = measures[:, 0]
    bmd = bmd.reshape((bmd.shape[0], 1))  # column vector so it can be hstacked

    X = np.hstack((bmd, mfs))
    clf.fit(X, fexp)

    # Results (scored on the same data the model was fitted on).
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("Score (R^2): %s" % clf.score(X, fexp))
def ridgereg(a):
    """Fit Ridge(alpha=a) on the module-level training data and dump predictions.

    Reads the module-level ``base_X``/``base_Y`` for fitting and ``X_test``
    for prediction, prints the training-set score, and hands the predictions
    to ``write_to_file`` under the name "ridge.csv".
    """
    print("Doing ridge regression")
    model = Ridge(alpha=a)
    model.fit(base_X, base_Y)
    training_score = model.score(base_X, base_Y)
    print("Score = %f" % training_score)
    predictions = model.predict(X_test)
    write_to_file("ridge.csv", predictions)
def comparaison_ridge_lasso(X, Y):
    """Compare default Lasso and Ridge on one random 70/30 split.

    Both models are fitted on the same training part and their ``score`` on
    the held-out 30% is printed as a percentage. Nothing is returned.
    """
    # random_state=None draws from the current global RNG state. Do not pass
    # random.seed() here: it returns None anyway and additionally reseeds the
    # global generator, destroying any seed the caller set for reproducibility.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=None)

    clf_lasso = Lasso(selection='random', random_state=None)
    clf_ridge = Ridge()
    clf_lasso.fit(X_train, Y_train)
    clf_ridge.fit(X_train, Y_train)

    score_lasso = clf_lasso.score(X_test, Y_test)
    score_ridge = clf_ridge.score(X_test, Y_test)
    print("Precision de Lasso={:3.2f}% \nPrecision de Ridge={:3.2f}%\n".format(score_lasso*100,score_ridge*100))
def test_huber_better_r2_score():
    """Huber must beat Ridge on the inliers and lose to it on the outliers."""
    X, y = make_regression_with_outliers()

    huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
    huber.fit(X, y)

    # Samples whose residual stays below epsilon * scale_ count as inliers.
    residuals = np.dot(X, huber.coef_) + huber.intercept_ - y
    inliers = np.abs(residuals) < huber.epsilon * huber.scale_
    outliers = ~inliers

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the huber
    # regressor.
    ridge = Ridge(fit_intercept=True, alpha=0.01)
    ridge.fit(X, y)

    huber_inlier_r2 = huber.score(X[inliers], y[inliers])
    huber_outlier_r2 = huber.score(X[outliers], y[outliers])
    ridge_inlier_r2 = ridge.score(X[inliers], y[inliers])
    ridge_outlier_r2 = ridge.score(X[outliers], y[outliers])

    assert_greater(huber_inlier_r2, ridge_inlier_r2)

    # The huber model should also fit poorly on the outliers.
    assert_greater(ridge_outlier_r2, huber_outlier_r2)
def training_predict_ridge(df):
    """Return the per-fold test scores of a ridge model over 10 shuffled folds.

    For every fold the result of ``process_ridge`` on the training rows is
    used as the Ridge ``alpha``. Feature and target columns come from the
    module-level ``predictors`` and ``target1``.
    """
    fold_scores = []
    # 10 shuffled folds, i.e. ten independent repetitions
    for train_rows, test_rows in KFold(len(df), n_folds=10, shuffle=True):
        alpha = process_ridge(df.T[train_rows].T)
        model = Ridge(alpha=alpha)

        features = df[predictors]
        targets = df[target1]
        model.fit(features.T[train_rows].T,
                  targets.T[train_rows].values.ravel())
        fold_score = model.score(features.T[test_rows].T,
                                 targets.T[test_rows].values.ravel())
        fold_scores.append(fold_score)
    return fold_scores
def test_alpha_opti(X, Y, nb_tests):
    """Compare default-alpha and alpha=0.1 Lasso/Ridge over random splits.

    Runs ``nb_tests`` random 70/30 splits. On each split a default model and
    an ``alpha=0.1`` model are fitted for both Lasso and Ridge and scored on
    the held-out part. The mean score difference (alpha=0.1 minus default)
    is printed as a percentage for each family. Nothing is returned.

    Raises ZeroDivisionError when ``nb_tests`` is 0.
    """
    score_lasso = 0
    score_ridge = 0
    score_lasso_opti = 0
    score_ridge_opti = 0

    for _ in range(nb_tests):
        # random_state=None uses the current global RNG state. Passing
        # random.seed() would also yield None but reseeds the global
        # generator on every call, overriding any seed set by the caller.
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=0.3, random_state=None)

        clf_lasso = Lasso(selection='random', random_state=None)
        clf_ridge = Ridge()
        clf_lasso.fit(X_train, Y_train)
        clf_ridge.fit(X_train, Y_train)
        score_lasso += clf_lasso.score(X_test, Y_test)
        score_ridge += clf_ridge.score(X_test, Y_test)

        # Same split, smaller regularisation strength.
        clf_lasso_opti = Lasso(selection='random', random_state=None,
                               alpha=0.1)
        clf_ridge_opti = Ridge(alpha=0.1)
        clf_lasso_opti.fit(X_train, Y_train)
        clf_ridge_opti.fit(X_train, Y_train)
        score_lasso_opti += clf_lasso_opti.score(X_test, Y_test)
        score_ridge_opti += clf_ridge_opti.score(X_test, Y_test)

    print("Lasso (opti - non-opti) : {:3.3f}%".format(100*(score_lasso_opti-score_lasso)/nb_tests))
    print("Ridge (opti - non-opti) : {:3.3f}%".format(100*(score_ridge_opti-score_ridge)/nb_tests))
def _regression_surface(
        userdata, switch_indiceses, corpus, filename):
    """Analyze data and make plot of document position and length vs.
    labeling time.

    For every user, each stretch between two consecutive switch indices that
    is exactly 16 items long contributes 16 samples: the document length,
    the position within the stretch (1..16) and the labeling time. A ridge
    model ``time ~ doclength + position`` is fitted on all samples, and its
    predictions over positions 1..16 are drawn for a few fixed document
    lengths. The in-sample R^2 becomes the plot title and the figure is
    saved to ``filename``.
    """
    doclengths = []
    positions = []
    times = []
    for user, data in userdata.items():
        curdoclengths = _get_doclengths_for_user(userdata, user, corpus)
        switch_indices = switch_indiceses[user]
        user_times = _build_data_times(user, data)
        for i in range(1, len(switch_indices)):
            start, stop = switch_indices[i-1], switch_indices[i]
            # only complete stretches of 16 documents are used
            if stop - start == 16:
                doclengths.extend(curdoclengths[start:stop])
                positions.extend(np.arange(1, 17))
                times.extend(user_times[start:stop])
    doclengths = np.array(doclengths)
    positions = np.array(positions)
    times = np.array(times)

    model_inputs = np.stack((doclengths, positions), axis=-1)
    ridge_model = Ridge()
    ridge_model.fit(model_inputs, times)
    r2 = ridge_model.score(model_inputs, times)

    fig, axis = plt.subplots(1, 1)
    try:
        xdata = np.arange(1, 17)
        for doclength in [30, 50, 100, 200, 500, 1000]:
            inputs = np.stack(
                (np.array([doclength]*len(xdata)), xdata), axis=-1)
            ydata = ridge_model.predict(inputs)
            axis.plot(
                xdata,
                ydata,
                linewidth=2,
                label=str(doclength))
            # apparently, all of the lines go down by 6.02762577314 from
            # first labeling time to 16th
            # axis.annotate(str(ydata[0] - ydata[-1]), (xdata[-1], ydata[-1]))
        # shrink the axes to 80% width so the legend fits to the right
        box = axis.get_position()
        axis.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        legend = axis.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        legend.set_title('Document length (in tokens)')
        axis.set_title('$R^2=$'+str(r2))
        axis.set_xlabel('Document order')
        axis.set_ylabel('Time (seconds)')
        fig.savefig(filename, bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this, repeated calls accumulate open figures.
        plt.close(fig)
def regress(X, y, iterations=10):
    """Print in-sample and repeated hold-out R^2 for ridge regression.

    First a Ridge(alpha=.1) is fitted on all data and its in-sample R^2 is
    printed. Then, ``iterations`` times, the data is split with a random
    seed drawn from ``randint(0, 100)``, a Ridge(alpha=10.0) is fitted on
    the training part and scored on the test part; the mean and standard
    deviation of those scores are printed. Nothing is returned.
    """
    full_fit = Ridge(alpha=.1).fit(X, y)
    print("within sample R^2: " + str(full_fit.score(X, y)))
    print('\n')

    linear_scores = []
    for _ in range(iterations):
        split = cross_validation.train_test_split(
            X, y, random_state=randint(0, 100))
        X_train, X_test, y_train, y_test = split
        holdout_model = Ridge(alpha=10.0)
        holdout_model.fit(X_train, y_train)
        linear_scores.append(holdout_model.score(X_test, y_test))

    mean_text = str(np.average(linear_scores))
    std_text = str(np.std(linear_scores))
    print('linear scores:\tmean = ' + mean_text + '\tstd dev = ' + std_text)
def test_sag_regressor():
    """tests if the sag regressor performs well"""
    xmin, xmax = -5, 5
    n_samples = 20
    tol = .001
    max_iter = 20
    alpha = 0.1
    rng = np.random.RandomState(0)
    X = np.linspace(xmin, xmax, n_samples).reshape(n_samples, 1)

    def fit_dense_and_sparse(target):
        # Fit the same estimator on dense and CSR input; score both on X.
        dense_clf = Ridge(tol=tol, solver='sag', max_iter=max_iter,
                          alpha=alpha * n_samples)
        sparse_clf = clone(dense_clf)
        dense_clf.fit(X, target)
        sparse_clf.fit(sp.csr_matrix(X), target)
        return dense_clf.score(X, target), sparse_clf.score(X, target)

    # simple linear function without noise
    y = 0.5 * X.ravel()
    score1, score2 = fit_dense_and_sparse(y)
    assert_greater(score1, 0.99)
    assert_greater(score2, 0.99)

    # simple linear function with noise
    y = 0.5 * X.ravel() + rng.randn(n_samples, 1).ravel()
    score1, score2 = fit_dense_and_sparse(y)
    assert_greater(score1, 0.5)
    assert_greater(score2, 0.5)
def build_model(train_file, test_file, attr_file, model_out, predictions_out,
                algorithm='ridge'):
    """Train a regressor, evaluate it on a test set and persist the results.

    Parameters:
        train_file, test_file: paths of pickled pandas objects read with
            ``pd.read_pickle``. NOTE: unpickling can execute arbitrary code,
            so only pass files from a trusted source.
        attr_file: passed to ``read_attrs``; the first returned attribute is
            the target column, the remaining ones are the feature columns.
        model_out: path the fitted model is pickled to.
        predictions_out: path the test rows plus a 'predicted' column are
            pickled to.
        algorithm: one of 'ridge', 'linear', 'lasso', 'rf', 'en'.

    Raises:
        NotImplementedError: if ``algorithm`` is not one of the names above.

    A summary (sizes, coefficients when the model exposes ``coef_``, squared
    error statistics and the model's ``score`` on the test set) is printed.
    """
    classifiers = ['ridge', 'linear', 'lasso', 'rf', 'en']
    if algorithm not in classifiers:
        raise NotImplementedError("only implemented algorithms: " + str(classifiers))

    train_data = pd.read_pickle(train_file)

    attrs = read_attrs(attr_file)
    target_attr = attrs[0]
    usable_attrs = attrs[1:]

    if algorithm == 'ridge':
        clf = Ridge()
    elif algorithm == 'linear':
        clf = LinearRegression()
    elif algorithm == 'lasso':
        clf = Lasso()
    elif algorithm == 'en':
        clf = ElasticNet()
    else:  # 'rf' -- the only name left after the validation above
        clf = RandomForestRegressor()

    clf.fit(train_data[usable_attrs], train_data[target_attr])

    test_data = pd.read_pickle(test_file)
    predictions = clf.predict(test_data[usable_attrs])
    errors = predictions - test_data[target_attr]

    prediction_results = test_data[[target_attr] + usable_attrs].copy()
    prediction_results['predicted'] = predictions
    prediction_results.to_pickle(predictions_out)

    # Every print takes a single pre-formatted string so the output is the
    # same under Python 2 and Python 3.
    print("Modeling '%s'" % target_attr)
    print(" Train: %s (%d examples)" % (train_file, len(train_data)))
    print(" Test: %s (%d examples)" % (test_file, len(test_data)))
    print("Algorithm: %s" % algorithm)

    if hasattr(clf, 'coef_'):
        print('Coefficients:')
        for attr_name, coef in zip(usable_attrs, clf.coef_):
            print(' %-20s: %20.4f' % (attr_name, coef))

    squared_errors = errors ** 2
    print('MSE : %10.4f' % np.mean(squared_errors))
    print('medSE: %10.4f' % np.median(squared_errors))
    print('SSE : %10.4f' % np.sum(squared_errors))
    print('Variance score: %.4f' % clf.score(test_data[usable_attrs], test_data[target_attr]))

    # Close the output file deterministically, even if pickling fails.
    with open(model_out, 'wb') as model_file:
        pickle.dump(clf, model_file)
def forward_selection(self, data, labels, weights, num_features):
    """Greedily pick feature columns, one per round, by weighted fit score.

    Each round fits an unregularised Ridge (alpha=0) on the already selected
    columns plus one candidate column and keeps the candidate with the
    highest weighted ``score``. At most ``num_features`` rounds are run
    (fewer if ``data`` has fewer columns). Returns the selected column
    indices as a numpy array, in selection order.
    """
    clf = Ridge(alpha=0, fit_intercept=True, random_state=self.random_state)
    n_columns = data.shape[1]
    used_features = []
    for _ in range(min(num_features, n_columns)):
        best_score = -100000000
        best_feature = 0
        for candidate in range(n_columns):
            if candidate in used_features:
                continue
            subset = data[:, used_features + [candidate]]
            clf.fit(subset, labels, sample_weight=weights)
            candidate_score = clf.score(subset, labels, sample_weight=weights)
            # strict comparison: on ties the lowest column index wins
            if candidate_score > best_score:
                best_feature = candidate
                best_score = candidate_score
        used_features.append(best_feature)
    return np.array(used_features)
def run_full_example(df, ridge_alpha=1.0, test_set_fraction=0.5):
    """Fit a ridge regression on ``df`` and print a test-set evaluation.

    Parameters:
        df: pandas DataFrame handed to ``data_frame_to_matrix`` together
            with 'energy' and ['weather'] (presumably the target column and
            the categorical columns -- confirm against that helper).
        ridge_alpha: regularisation strength passed to ``Ridge``.
        test_set_fraction: fraction of the samples held out for testing.

    Prints the split sizes, alpha, the fitted weights and intercept, the
    mean squared error of the model and of a mean-only baseline on the test
    set, and the test-set R-squared. Nothing is returned.
    """
    # convert Pandas DataFrame to a feature matrix
    X, y, col_names = data_frame_to_matrix(df, 'energy', ['weather'])

    # split into training and test sets
    Xtrain, Xtest, ytrain, ytest = train_test_split(
        X, y, test_size=test_set_fraction)

    # Single-argument print() calls behave the same on Python 2 and 3.
    print('# of training samples: {}'.format(len(ytrain)))
    print('# of test samples: {}'.format(len(ytest)))
    print('alpha: {:.2f}'.format(ridge_alpha))
    print('')

    # create a Ridge object and fit the training data
    rr = Ridge(alpha=ridge_alpha)
    rr.fit(Xtrain, ytrain)

    # print out the weights and their names
    for weight, cname in zip(rr.coef_, col_names):
        print("{}: {:.6f}".format(cname, weight))
    print("Intercept: {:.6f}".format(rr.intercept_))
    print('')

    # compute the prediction on the test set
    ypred = rr.predict(Xtest)

    # compute the sum-of-squares error on the test set (normalised by the
    # number of test samples), which is proportional to the log likelihood
    sqerr = np.sum((ytest - ypred)**2) / len(ytest)
    print('Normalized Sum-of-squares Error: {:.3f}'.format(sqerr))

    # compute the sum-of-squares error for a model that is just
    # comprised of the mean on the training set
    sqerr_mean_only = np.sum((ytest - ytrain.mean())**2) / len(ytest)
    print('Normalized Sum-of-squares Error for mean-only: {:.3f}'.format(sqerr_mean_only))

    # print out the R-squared on the test set
    r2 = rr.score(Xtest, ytest)
    print("R-squared: {:.2f}".format(r2))
    print('')
def _random_search(self, random_iter, x, y):
    """Pick a Ridge ``alpha`` by random search on 50/50 hold-out splits.

    The default ``alpha=1.0`` is always the first candidate; the remaining
    ``random_iter - 1`` candidates are sampled uniformly from
    [0.0001, 10] using ``self._rng``. Each candidate is fitted on one half
    of a fresh random split and scored on the other half; the alpha with the
    highest score is returned. With ``random_iter <= 0`` no search is done
    and 1.0 is returned.

    Progress is written to stdout: "." per candidate, "#" for every tenth
    candidate and "<" whenever a new best score is found.
    """
    # Default Values
    alpha = 1.0
    # -inf instead of -sys.maxint: sys.maxint does not exist on Python 3.
    best_score = float("-inf")

    if random_iter > 0:
        sys.stdout.write("Do a random search %d times" % random_iter)
        param_dist = {"alpha": uniform(loc=0.0001, scale=10-0.0001)}
        param_list = [{"alpha": alpha}, ]
        param_list.extend(list(ParameterSampler(param_dist,
                                                n_iter=random_iter-1,
                                                random_state=self._rng)))
        for idx, d in enumerate(param_list):
            rr = Ridge(alpha=d["alpha"],
                       fit_intercept=True,
                       normalize=False,
                       copy_X=True,
                       max_iter=None,
                       tol=0.001,
                       solver='auto')

            train_x, test_x, train_y, test_y = \
                train_test_split(x, y, test_size=0.5,
                                 random_state=self._rng)
            rr.fit(train_x, train_y)
            sc = rr.score(test_x, test_y)

            # Tiny output
            m = "."
            if idx % 10 == 0:
                m = "#"
            if sc > best_score:
                m = "<"
                best_score = sc
                alpha = d['alpha']
            sys.stdout.write(m)
            sys.stdout.flush()
        sys.stdout.write("Using alpha: %f\n" % alpha)
    return alpha
def apply_ridge(X_train, Y_train, alpha=None):
    """Fit ridge regression for one alpha or a default grid and analyse it.

    Parameters:
        X_train, Y_train: training data passed to ``Ridge.fit``.
        alpha: regularisation strength. When falsy (``None`` or 0) the grid
            0.1, 1/3, 1, 10/3, 10 is tried instead.

    For every alpha the in-sample R^2, the squared L2 norm of the
    coefficients and the coefficients themselves are printed.

    Returns:
        The ``(model, Theta, J, SCORE)`` tuple produced by
        ``performance_analysis`` for the LAST fitted model (the largest grid
        value when the grid is used -- no best-alpha selection is done).
        ``Theta`` is passed in as the intercept followed by the coefficients.
    """
    if alpha:
        alphas = [alpha]
    else:
        # Spelled out in ascending order rather than built with
        # sorted(set([alpha, ...])): sorting None together with floats
        # raises TypeError on Python 3.
        alphas = [0.1, 1.0/3.0, 1.0, 10.0/3.0, 10.0]

    for a in alphas:
        model = Ridge(alpha=a, fit_intercept=True, normalize=False,
                      copy_X=True, max_iter=None, tol=0.001, solver='auto')
        model.fit(X_train, Y_train)
        R2 = model.score(X_train, Y_train)
        L2 = dot(model.coef_, model.coef_)
        print("ALPHA: %.2f \t R^2=%7.4f \t L2_NORM(THETA)=%10.2f \t THETA[1:N]=%s" % (a, R2, L2, model.coef_))

    Theta = [float(model.intercept_), ]
    Theta.extend([float(x) for x in model.coef_])
    (model, Theta, J, SCORE) = performance_analysis(model, Theta, X_train, Y_train, debug=1)
    return (model, Theta, J, SCORE)
def example4():
    """Fit a default Ridge on generated solar data and print the results.

    Generates 1000 samples with ``generate_solar_data``, splits them 50/50,
    fits ``Ridge()`` on the training half and prints the weights, the
    intercept and the R-squared on the test half. Nothing is returned.
    """
    # generate the dataset
    df = generate_solar_data(num_samples=1000)

    # convert Pandas DataFrame to a feature matrix
    X, y, col_names = data_frame_to_matrix(df, 'energy', ['weather'])

    # split into training and test sets
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.5)

    # create a Ridge object and fit the training data
    rr = Ridge()
    rr.fit(Xtrain, ytrain)

    # print out the weights and their names
    # (single-argument print() behaves the same on Python 2 and 3)
    for weight, cname in zip(rr.coef_, col_names):
        print("{}: {:.6f}".format(cname, weight))
    print("Intercept: {:.6f}".format(rr.intercept_))

    # print out the R-squared on the test set
    r2 = rr.score(Xtest, ytest)
    print("R-squared: {:.2f}".format(r2))
# Report for the plain linear model fitted earlier in the script.
print('Crime dataset')
for _template, _value in (
        ('linear model intercept: {}', linreg.intercept_),
        ('linear model coeff:\n{}', linreg.coef_),
        ('R-squared score (training): {:.3f}', linreg.score(X_train, y_train)),
        ('R-squared score (test): {:.3f}', linreg.score(X_test, y_test)),
):
    print(_template.format(_value))

# ridge regression approach --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                    random_state = 0)

linridge = Ridge(alpha=20.0)
linridge.fit(X_train, y_train)

print('Crime dataset')
for _template, _value in (
        ('ridge regression linear model intercept: {}', linridge.intercept_),
        ('ridge regression linear model coeff:\n{}', linridge.coef_),
        ('R-squared score (training): {:.3f}', linridge.score(X_train, y_train)),
        ('R-squared score (test): {:.3f}', linridge.score(X_test, y_test)),
        ('Number of non-zero features: {}', np.sum(linridge.coef_ != 0)),
):
    print(_template.format(_value))

# ridge regression with normalization approach --------------------------------
scaler = MinMaxScaler()

X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime,
                                                    random_state = 0)

# The scaler is fitted on the training part only and then reused for the
# test part.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

linridge = Ridge(alpha=20.0)
linridge.fit(X_train_scaled, y_train)

print('Crime dataset')
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

# Let's build a linear regression on the extended Boston dataset
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X,y=mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

Linreg = LinearRegression()
lr = Linreg.fit(X_train, y_train)

for _template, _value in (
        ("Training set score: {:.2f}", lr.score(X_train, y_train)),
        ("Test set score: {:.2f}", lr.score(X_test, y_test)),
):
    print(_template.format(_value))

# Ridge regression-------------------------------------------------------------
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)
for _template, _value in (
        ('Training set score : {}', ridge.score(X_train, y_train)),
        ('Test set score : {}', ridge.score(X_test, y_test)),
):
    print(_template.format(_value))

# The Ridge model makes a trade-off between the simplicity of the model
# (near-zero coefficients) and its performance on the training set. How much
# importance the model places on simplicity versus training set performance
# can be specified by the user, using the alpha parameter. In the previous
# example, we used the default parameter alpha=1.0. There is no reason why
# this will give us the best trade-off, though. The optimum setting of alpha
# depends on the particular dataset we are using. Increasing alpha forces
# coefficients to move more toward zero, which decreases training set
# performance but might help generalization. For example:
ridge10 = Ridge(alpha=10)
ridge10.fit(X_train, y_train)
for _template, _value in (
        ('Training set score : {}', ridge10.score(X_train, y_train)),
        ('Test set score : {}', ridge10.score(X_test, y_test)),
):
    print(_template.format(_value))
y = np.array(y_list)

# =============================================================================
# PERFORM ML PREDICTION
np.random.seed(0)
split_idxs = np.random.permutation(len(X))

# Split Data (Training Testing): the last 500 shuffled samples are held out
X_train = X[split_idxs[:-500]]
y_train = y[split_idxs[:-500]]
X_test = X[split_idxs[-500:]]
y_test = y[split_idxs[-500:]]

# Ridge regression (a regressor, despite the "clf" name)
from sklearn.linear_model import Ridge
clf = Ridge(alpha=1.0)
clf.fit(X_train,y_train)
predictions = clf.predict(X_test)

# Single-argument print() behaves the same on Python 2 and Python 3.
print(predictions)
test_score = clf.score(X_test,y_test)  # computed once, printed and logged
print(test_score)

# Append the run to the log; the with-block closes the file even if a
# write fails.
ts = time.time()
with open("Log.txt", "a") as text_file:
    text_file.write("Data collected from %s" % datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S\n'))
    text_file.write("Prediction Score: %f\n\n" % test_score)
avg_train_score = 0
avg_test_score = 0
target_data_file = "targets_%s.dat" % target
# Each print takes one pre-formatted string so the output does not depend
# on whether print is a statement or a function.
print("Starting to train a model to predict %s..." % target.replace('_', ' '))
# Close the pickle file deterministically instead of leaking the handle.
# NOTE(review): text mode 'r' only suits protocol-0 pickles; confirm how the
# file was written before switching to 'rb'.
with open('2013-04-20 183207/' + target_data_file, 'r') as target_file:
    target_matrix = cPickle.load(target_file)
# The message mentions a CSR matrix, but the targets become a dense array.
print("Converting targets to CSR Matrix to make life easier...")
target_matrix = np.array(target_matrix)

# One name for the fold count so the averages below cannot drift from KFold.
n_folds = 3
kf = KFold(len(target_matrix), n_folds=n_folds, indices=True, shuffle=True)
for train_index, test_index in kf:
    print("Beginning Fold")
    kfold_train = feature_matrix[train_index]
    kfold_test = feature_matrix[test_index]
    kfold_train_target = target_matrix[train_index]
    kfold_test_target = target_matrix[test_index]

    #clf = SGDRegressor(n_iter=1000, shuffle=True)
    clf = Ridge()
    clf.fit(kfold_train, kfold_train_target)
    score_train = clf.score(kfold_train, kfold_train_target)
    score_test = clf.score(kfold_test, kfold_test_target)
    print("R^2 Score On Training Data: %s" % score_train)
    avg_train_score += score_train
    print("R^2 Score On Validation Data: %s" % score_test)
    avg_test_score += score_test

avg_train_score = avg_train_score / float(n_folds)
avg_test_score = avg_test_score / float(n_folds)
print("Average Score on Training Data: %s" % avg_train_score)
print("Average Score on Testing Data: %s" % avg_test_score)
cv=10) search.fit(Xs, ys) search.best_params_ # In[17]: ######Ridge X_train, X_test, y_train, y_test = train_test_split(Xs, ys, test_size=0.2, random_state=10) ridge = Ridge(alpha=1, normalize=False) ridge.fit(X_train, y_train) y_pred = ridge.predict(X_test) # # Compute and print R^2 and RMSE print("R^2: {}".format(ridge.score(X_test, y_test))) rmse = np.sqrt(mean_squared_error(y_test, y_pred)) print(" Test Root Mean Squared Error: {}".format(rmse)) # In[30]: y0_pred = ridge.predict(X_test) y1_pred = ridge.predict(X_train) # # Compute and print R^2 and RMSE print("R^2: {}".format(ridge.score(X_test, y_test))) rmse0 = np.sqrt(mean_squared_error(y_test, y0_pred)) rmse = np.sqrt(mean_squared_error(y_train, y1_pred)) print("Root Mean Squared Error for Test: {}".format(rmse0)) print("Root Mean Squared Error for Train: {}".format(rmse)) # In[164]:
class Regressor():
    """Wraps scikitlearn regressors.

    Parameters
    ----------
    strategy : string, default = "LightGBM" (if installed else "XGBoost")
        The choice for the regressor.
        Available strategies = "LightGBM" (if installed), "XGBoost",
        "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost"
        or "Linear"

    **params : parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...
    """

    def __init__(self, **params):
        if ("strategy" in params):
            self.__strategy = params["strategy"]
        else:
            if (lgbm_installed):
                self.__strategy = "LightGBM"
            else:
                self.__strategy = "XGBoost"

        self.__regress_params = {}
        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None

        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):
        """Return the strategy plus every explicitly set regressor parameter.

        ``deep`` is accepted but not used.
        """
        params = {}
        params["strategy"] = self.__strategy
        params.update(self.__regress_params)

        return params

    def __invalid_param_warning(self):
        # Single place for the warning emitted on an unknown parameter.
        warnings.warn("Invalid parameter for regressor "
                      + str(self.__strategy)
                      + ". Parameter IGNORED. Check the list of "
                      "available parameters with "
                      "`regressor.get_params().keys()`")

    def set_params(self, **params):
        """Set the strategy and/or parameters of the wrapped regressor.

        Changing the strategy builds a fresh default regressor and re-applies
        the previously stored parameters to it. Parameters the wrapped
        regressor does not know are skipped with a warning. Any call marks
        the wrapper as not fitted.
        """
        self.__fitOK = False

        if 'strategy' in params.keys():
            self.__set_regressor(params['strategy'])

            # Validate stored parameters against the NEW estimator. Checking
            # them against self.get_params() could never fail, because that
            # dict always contains the stored keys.
            valid_keys = self.__regressor.get_params().keys()
            for k, v in self.__regress_params.items():
                if k not in valid_keys:
                    self.__invalid_param_warning()
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if (k == "strategy"):
                continue
            if k not in self.__regressor.get_params().keys():
                self.__invalid_param_warning()
            else:
                setattr(self.__regressor, k, v)
                self.__regress_params[k] = v

    def __set_regressor(self, strategy):
        """Instantiate the default estimator for ``strategy``.

        Raises ValueError for an unknown strategy. "LightGBM" falls back to
        "XGBoost" (with a warning) when lightgbm is not installed.
        """
        self.__strategy = strategy

        if (strategy == 'RandomForest'):
            self.__regressor = RandomForestRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif (strategy == 'XGBoost'):
            self.__regressor = XGBRegressor(
                n_estimators=500, max_depth=6, learning_rate=0.05,
                colsample_bytree=0.8, colsample_bylevel=1., subsample=0.9,
                nthread=-1, seed=0)

        elif (strategy == "LightGBM"):
            if (lgbm_installed):
                self.__regressor = LGBMRegressor(
                    n_estimators=500, learning_rate=0.05,
                    colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0)
            else:
                warnings.warn(
                    "Package lightgbm is not installed. Model LightGBM will "
                    "be replaced by XGBoost")
                # Same defaults as the explicit 'XGBoost' strategy.
                self.__set_regressor("XGBoost")

        elif (strategy == 'ExtraTrees'):
            self.__regressor = ExtraTreesRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif (strategy == 'Tree'):
            self.__regressor = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=None,
                min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features=None,
                random_state=0, max_leaf_nodes=None, presort=False)

        elif (strategy == "Bagging"):
            self.__regressor = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=.9,
                max_features=.85, bootstrap=False, bootstrap_features=False,
                n_jobs=-1, random_state=0)

        elif (strategy == "AdaBoost"):
            self.__regressor = AdaBoostRegressor(
                base_estimator=None, n_estimators=400, learning_rate=.05,
                random_state=0)

        elif (strategy == "Linear"):
            self.__regressor = Ridge(
                alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
                max_iter=None, tol=0.001, solver='auto', random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM' "
                "(if installed), 'XGBoost', 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    def fit(self, df_train, y_train):
        """Fits Regressor.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
            The target for regression tasks.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``df_train`` is not a DataFrame or ``y_train`` not a Series.
        """
        # sanity checks -- isinstance also accepts DataFrame subclasses and
        # does not depend on pd.SparseDataFrame, which newer pandas removed.
        if not isinstance(df_train, pd.DataFrame):
            raise ValueError("df_train must be a DataFrame")

        if not isinstance(y_train, pd.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def feature_importances(self):
        """Computes feature importances.

        Regressor must be fitted before.

        Parameters
        ----------
        None

        Returns
        -------
        importance : dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key). Empty for strategies without a rule.

        Raises
        ------
        ValueError
            If the regressor has not been fitted.
        """
        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        strategy = self.get_params()["strategy"]
        estimator = self.get_estimator()
        importance = {}

        if strategy in ["Linear"]:
            f = np.abs(estimator.coef_)
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["LightGBM", "XGBoost", "RandomForest",
                          "ExtraTrees", "Tree"]:
            f = estimator.feature_importances_
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["AdaBoost"]:
            # weighted average of the base estimators' importances
            norm = estimator.estimator_weights_.sum()
            try:
                # XGB, RF, ET, Tree and AdaBoost
                # TODO: Refactor this part
                f = sum(weight * est.feature_importances_
                        for weight, est in zip(estimator.estimator_weights_,
                                               estimator.estimators_)) / norm
            except Exception:
                # Linear base estimators expose coef_ instead
                f = sum(weight * np.abs(est.coef_)
                        for weight, est in zip(estimator.estimator_weights_,
                                               estimator.estimators_)) / norm
            for i, col in enumerate(self.__col):
                importance[col] = f[i]

        elif strategy in ["Bagging"]:
            importance_bag = []
            for i, b in enumerate(estimator.estimators_):
                d = {}
                try:
                    # XGB, RF, ET, Tree and AdaBoost
                    f = b.feature_importances_
                except Exception:
                    f = np.abs(b.coef_)  # Linear
                # map each base estimator's local feature index back to the
                # column it was drawn from
                for j, c in enumerate(estimator.estimators_features_[i]):
                    d[self.__col[c]] = f[j]
                importance_bag.append(d.copy())

            for col in self.__col:
                # Average over the base estimators that used the column with
                # a non-zero importance. A real list is required: np.mean
                # cannot consume the lazy filter object of Python 3.
                importance[col] = np.mean(
                    [k[col] for k in importance_bag
                     if col in k and k[col] != 0])

        return importance

    def predict(self, df):
        '''Predicts the target.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        y : array of shape = (n, )
            The target to be predicted.

        Raises
        ------
        ValueError
            If the regressor is not fitted or ``df`` is not a DataFrame.
        '''
        if not callable(getattr(self.__regressor, "predict")):
            raise ValueError("predict attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        return self.__regressor.predict(df.values)

    def transform(self, df):
        '''Transforms df.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        df_transform : pandas dataframe of shape = (n, n_selected_features)
            The transformed dataset with its most important features.

        Raises
        ------
        AttributeError
            If the wrapped regressor has no ``transform`` attribute.
        ValueError
            If the regressor is not fitted or ``df`` is not a DataFrame.
        '''
        if not callable(getattr(self.__regressor, "transform")):
            raise ValueError("transform attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        return self.__regressor.transform(df.values)

    def score(self, df, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The numerical target for regression tasks.

        sample_weight : optional
            Forwarded unchanged to the wrapped regressor's ``score``.

        Returns
        -------
        score : float
            R^2 of self.predict(df) wrt. y.

        Raises
        ------
        ValueError
            If the regressor is not fitted, ``df`` is not a DataFrame or
            ``y`` is not a Series.
        """
        if not callable(getattr(self.__regressor, "score")):
            raise ValueError("score attribute is not callable")

        if not self.__fitOK:
            raise ValueError("You must call the fit function before !")

        # sanity checks
        if not isinstance(df, pd.DataFrame):
            raise ValueError("df must be a DataFrame")

        if not isinstance(y, pd.Series):
            raise ValueError("y must be a Series")

        return self.__regressor.score(df.values, y, sample_weight)

    def get_estimator(self):
        """Return a shallow copy of the wrapped regressor."""
        return copy(self.__regressor)
# create and train a few models
lr = LinearRegression(normalize=True)
lr.fit(X_train, Y_train)

lasso = Lasso(alpha=0.01)
lasso.fit(X_train, Y_train)

ridge = Ridge(alpha=0.1)
ridge.fit(X_train, Y_train)

rfr = RandomForestRegressor()
rfr.fit(X_train, Y_train)

mlp = MLPRegressor(hidden_layer_sizes=(200,), max_iter=1000)
mlp.fit(X_train, Y_train)

# print model scores and comparison; the acc_* values come from each
# regressor's own score() method, not from accuracy_score
from sklearn.metrics import accuracy_score
acc_lr = lr.score(X_test, Y_test)
acc_lasso = lasso.score(X_test, Y_test)
acc_ridge = ridge.score(X_test, Y_test)
acc_rfr = rfr.score(X_test, Y_test)
acc_mlp = mlp.score(X_test, Y_test)

# One pre-formatted string per print keeps the output identical whether
# print is a statement (Python 2) or a function (Python 3).
print("LinearRegression:  %s" % acc_lr)
print("Lasso:  %s" % acc_lasso)
print("Ridge:  %s" % acc_ridge)
print("RandomForestRegressor:  %s" % acc_rfr)
print("MLPRegressor:  %s" % acc_mlp)
Y, test_size=0.3, random_state=3) print len(X_test), len(y_test) lr = LinearRegression() lr.fit(X_train, y_train) rr = Ridge( alpha=0.01 ) # higher the alpha value, more restriction on the coefficients; low alpha > more generalization, coefficients are barely # restricted and in this case linear and ridge regression resembles rr.fit(X_train, y_train) rr100 = Ridge(alpha=100) # comparison with alpha value rr100.fit(X_train, y_train) train_score = lr.score(X_train, y_train) test_score = lr.score(X_test, y_test) Ridge_train_score = rr.score(X_train, y_train) Ridge_test_score = rr.score(X_test, y_test) Ridge_train_score100 = rr100.score(X_train, y_train) Ridge_test_score100 = rr100.score(X_test, y_test) print "linear regression train score:", train_score print "linear regression test score:", test_score print "ridge regression train score low alpha:", Ridge_train_score print "ridge regression test score low alpha:", Ridge_test_score print "ridge regression train score high alpha:", Ridge_train_score100 print "ridge regression test score high alpha:", Ridge_test_score100 # plt.plot(rr.coef_,alpha=0.7,linestyle='none',marker='*',markersize=5,color='red',label=r'Ridge; $\alpha = 0.01$',zorder=7) # zorder for ordering the markers # plt.plot(rr100.coef_,alpha=0.5,linestyle='none',marker='d',markersize=6,color='blue',label=r'Ridge; $\alpha = 100$') # alpha here is for transparency # plt.plot(lr.coef_,alpha=0.4,linestyle='none',marker='o',markersize=7,color='green',label='Linear Regression') # plt.xlabel('Coefficient Index',fontsize=16) # plt.ylabel('Coefficient Magnitude',fontsize=16) # plt.legend(fontsize=13,loc=4)
# Code starts here

# Default Lasso: fit on the training split, score (R^2) on the test split
lasso = Lasso()
lasso.fit(X_train, y_train)
r2_lasso = lasso.score(X_test, y_test)
lasso_pred = lasso.predict(X_test)
print(r2_lasso)


# --------------
from sklearn.linear_model import Ridge

# Code starts here

# Default Ridge, evaluated the same way
ridge = Ridge()
ridge.fit(X_train, y_train)
r2_ridge = ridge.score(X_test, y_test)
ridge_pred = ridge.predict(X_test)
print(r2_ridge)

# Code ends here


# --------------
from sklearn.model_selection import cross_val_score

#Code starts here
regressor = LinearRegression()

# Initiate cross validation score: 10-fold R^2 on the training split
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='r2')
print(score)

#calculate mean of the score
mean_score = np.mean(score)
def create_model(df, y, X, X_train, X_test, y_train, y_test, degree,
                 random_state, test_size, alpha):
    """Fit linear, Lasso, Ridge and polynomial regressions and print metrics.

    The continuous features are standardised, the ``Month``, ``Origin`` and
    ``Dest`` columns of ``df`` are one-hot encoded, and both parts are joined
    into one design matrix for the baseline, Lasso and Ridge models.  A
    polynomial model of the given ``degree`` is then fitted on the raw
    continuous features and cross-validated with 10 folds.

    Args:
        df: DataFrame providing the 'Month', 'Origin' and 'Dest' columns.
        y: Full target; split again here alongside the categorical columns.
        X: Not used by this function; kept so existing callers still work.
        X_train, X_test: Continuous-feature split.
        y_train, y_test: Overwritten by the targets of the categorical split
            below, so the values passed in are effectively ignored.
        degree: Degree passed to ``PolynomialFeatures``.
        random_state, test_size: Forwarded to ``train_test_split`` for the
            categorical split.
            NOTE(review): rows only line up with X_train/X_test if the caller
            produced that split with the same arguments -- confirm.
        alpha: Regularisation strength used for both Lasso and Ridge.

    Returns:
        None. All results are printed.
    """

    def report(title, model):
        # Same four metrics for every model fitted on the combined matrix.
        print(title)
        print('Training r^2:', model.score(X_train_all, y_train))
        print('Testing r^2:', model.score(X_test_all, y_test))
        print('Training MSE:',
              mean_squared_error(y_train, model.predict(X_train_all)))
        print('Testing MSE:',
              mean_squared_error(y_test, model.predict(X_test_all)))
        print("\n")

    # Standardise continuous features using training statistics only.
    ss = StandardScaler()
    ss.fit(X_train)
    X_train_scaled = ss.transform(X_train)
    X_test_scaled = ss.transform(X_test)

    X_cat = df[['Month', 'Origin', 'Dest']]
    X_train_cat, X_test_cat, y_train, y_test = train_test_split(
        X_cat, y, test_size=test_size, random_state=random_state)

    # One-hot encode the categorical variables.
    ohe = OneHotEncoder(handle_unknown='ignore')
    ohe.fit(X_train_cat)
    X_train_ohe = ohe.transform(X_train_cat)
    X_test_ohe = ohe.transform(X_test_cat)

    # get_feature_names was replaced by get_feature_names_out in newer
    # scikit-learn releases; use whichever this installation provides.
    name_getter = (getattr(ohe, 'get_feature_names_out', None)
                   or ohe.get_feature_names)
    columns = name_getter(X_train_cat.columns)
    cat_train_df = pd.DataFrame(X_train_ohe.toarray(), columns=columns)
    cat_test_df = pd.DataFrame(X_test_ohe.toarray(), columns=columns)

    X_train_all = pd.concat([pd.DataFrame(X_train_scaled), cat_train_df],
                            axis=1)
    X_test_all = pd.concat([pd.DataFrame(X_test_scaled), cat_test_df],
                           axis=1)
    # The scaled block has integer column labels and the encoded block has
    # string labels; newer scikit-learn rejects mixed label types.
    X_train_all.columns = X_train_all.columns.astype(str)
    X_test_all.columns = X_test_all.columns.astype(str)

    linreg_all = LinearRegression()
    linreg_all.fit(X_train_all, y_train)
    report('Baseline model Continuous and Categorical', linreg_all)

    lasso = Lasso(alpha=alpha)  # L1 penalty
    lasso.fit(X_train_all, y_train)
    report('Lasso', lasso)

    ridge = Ridge(alpha=alpha)  # L2 penalty
    ridge.fit(X_train_all, y_train)
    report('Ridge', ridge)

    # Expand the raw continuous features to higher-degree terms.
    poly_features = PolynomialFeatures(degree)
    X_train_poly = poly_features.fit_transform(X_train)

    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)

    y_train_predicted = poly_model.predict(X_train_poly)
    # Reuse the transformer fitted on the training data for the test data.
    y_test_predict = poly_model.predict(poly_features.transform(X_test))

    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    r2_train = r2_score(y_train, y_train_predicted)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))
    r2_test = r2_score(y_test, y_test_predict)

    print("\n")
    print(" Polynomial training set")
    print("RMSE of training set is {}".format(rmse_train))
    print("R2 score of training set is {}".format(r2_train))
    print("\n")
    print("Polynomial test set")
    print("RMSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))
    print("\n")

    print('Cross Validation for Polynomial model')
    lm = LinearRegression()
    # Accuracy only applies to classification, so score with R^2 and MSE.
    scores = cross_val_score(lm, X_train_poly, y_train, cv=10, scoring='r2')
    mse_scores = cross_val_score(lm, X_train_poly, y_train, cv=10,
                                 scoring='neg_mean_squared_error')
    print('Cross Validation Mean r2:', np.mean(scores))
    # The scorer returns negated MSE; flip the sign before reporting.
    print('Cross Validation Mean MSE:', -np.mean(mse_scores))
    print('Cross Validation 10 Fold Score:', scores)
    print('Cross Validation 10 Fold mean squared error', -(mse_scores))
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, shuffle=True, random_state=66)

from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Models: fit each linear model on the same training split.
model1 = LinearRegression().fit(x_train, y_train)
model2 = Ridge().fit(x_train, y_train)
model3 = Lasso().fit(x_train, y_train)

# Evaluation: score each model on the held-out split.
linear_score = model1.score(x_test, y_test)
ridge_score = model2.score(x_test, y_test)
lasso_score = model3.score(x_test, y_test)

print('linear_score: ', linear_score)
print('ridge_score: ', ridge_score)
print('lasso_score: ', lasso_score)
##################################################
# RIDGE REGRESSION
# PARAMETER TUNING
features = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8']

# Random row mask: rows drawing below 0.8 go to train, the rest to test.
msk = np.random.rand(len(tf)) < 0.8
train = tf[msk].reset_index(drop=True)
test = tf[~msk].reset_index(drop=True)

# Score every integer alpha from 0 to 1000 on the test rows.
row_list = []
for n in range(1001):
    clf = Ridge(alpha=n)
    clf.fit(train[features], train.nrtg)
    score = clf.score(test[features], test.nrtg)
    row_list.append({'alpha': n, 'score': score})

# Keep the first alpha that reaches the maximum score.
alpha_df = pd.DataFrame(row_list)
top_rows = alpha_df[alpha_df.score == alpha_df.score.max()]
alpha = top_rows.alpha.values[0]

# RIDGE REGRESSION
# Refit on all rows of tf with the selected alpha.
clf = Ridge(alpha=alpha)
clf.fit(tf[features], tf.nrtg)
coefficients = clf.coef_
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import mglearn

X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# One model per regularisation strength: default, high alpha, low alpha.
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge10 = Ridge(alpha=10)
ridge10.fit(X_train, y_train)
ridge01 = Ridge(alpha=0.1)
ridge01.fit(X_train, y_train)

# Train and test scores for each model.
for header, model in (("[default value of alpha]", ridge),
                      ("[alpha 10]", ridge10),
                      ("[alpha 0.1]", ridge01)):
    print(header)
    print("training set score: %f" % model.score(X_train, y_train))
    print("test set score: %f" % model.score(X_test, y_test))

# Coefficients of the three models on one plot.
plt.title("ridge_coefficients")
for legend_label, model in (("Ridge alpha=1", ridge),
                            ("Ridge alpha=10", ridge10),
                            ("Ridge alpha=0.1", ridge01)):
    plt.plot(model.coef_, 'o', label=legend_label)
plt.ylim(-25, 25)
plt.legend()
plt.show()
pdx = wine_quality[all_colnms]
pdy = wine_quality["quality"]

x_train, x_test, y_train, y_test = train_test_split(
    pdx, pdy, train_size=0.7, random_state=42)

alphas = [1e-4, 1e-3, 1e-2, 0.1, 0.5, 1.0, 5.0, 10.0]

initrsq = 0
# Alpha used for the final model. It falls back to the previously
# hard-coded 0.001 when no candidate scores above zero on the test split.
best_alpha = 0.001

print("\nRidge Regression: Best Parameters\n")

for alph in alphas:
    ridge_reg = Ridge(alpha=alph)
    ridge_reg.fit(x_train, y_train)
    tr_rsqrd = ridge_reg.score(x_train, y_train)
    ts_rsqrd = ridge_reg.score(x_test, y_test)

    # Report an alpha only when it improves on the best test R^2 so far.
    if ts_rsqrd > initrsq:
        print("Lambda: ", alph, "Train R-Squared value:", round(tr_rsqrd, 5),
              "Test R-squared value:", round(ts_rsqrd, 5))
        initrsq = ts_rsqrd
        best_alpha = alph

# Coefficients of the ridge model refitted with the selected alpha.
ridge_reg = Ridge(alpha=best_alpha)
ridge_reg.fit(x_train, y_train)

print("\nRidge Regression coefficient values of Alpha = {}\n".format(best_alpha))
# Pair names with coefficients directly rather than assuming 11 features.
for colnm, coef in zip(all_colnms, ridge_reg.coef_):
    print(colnm, ": ", coef)
#plt.show()
data = pd.read_csv('ridge.csv')

# Plot the traffic-count series.
plt.plot(data['TRAFFIC_COUNT'])
plt.show()

feature_columns = data.columns[1:5]
X = data[feature_columns]    # attribute data
y = data['TRAFFIC_COUNT']    # traffic count, i.e. the value to predict

# Degree 5 was reported to work well in the original author's testing.
poly = PolynomialFeatures(5)
# X becomes the generated polynomial features.
X = poly.fit_transform(X)

# Split into training and test sets: test_size is the test fraction and
# random_state is the random seed.
train_set_X, test_set_X, train_set_y, test_set_y = \
    cross_validation.train_test_split(X, y, test_size=0.3, random_state=0)

# Create the ridge regressor and train it on the training set.
clf = Ridge(alpha=1.0, fit_intercept=True)
clf.fit(train_set_X, train_set_y)

# Goodness of fit on the test set; the original note reports 0.7375.
# The score is at most 1 and has no lower bound; a model that outputs the
# same value for every input scores 0.
clf.score(test_set_X, test_set_y)

# Plot the fitted curve over samples 200 to 300.
start, end = 200, 300
y_pre = clf.predict(X)    # fitted values for every row
time = np.arange(start, end)

# Real data in blue, fitted curve in red.
plt.plot(time, y[start:end], 'b', label="real")
plt.plot(time, y_pre[start:end], 'r', label='predict')
plt.legend(loc='upper left')    # legend position
plt.show()
# Initialize scikit-learn ridge regression model model_ridge_scikit = RidgeRegression(alpha=alpha) # Trains scikit-learn ridge regression model model_ridge_scikit.fit(x_poly_train, y_train) print('Results for scikit-learn RidgeRegression model with alpha={}'. format(alpha)) # Test model on training set score_mse_ridge_scikit_train = score_mean_squared_error( model_ridge_scikit, x_poly_train, y_train) print('Training set mean squared error: {:.4f}'.format( score_mse_ridge_scikit_train)) score_r2_ridge_scikit_train = model_ridge_scikit.score( x_poly_train, y_train) print('Training set r-squared scores: {:.4f}'.format( score_r2_ridge_scikit_train)) # Save MSE and R-squared training scores scores_mse_ridge_scikit_train.append(score_mse_ridge_scikit_train) scores_r2_ridge_scikit_train.append(score_r2_ridge_scikit_train) # Test model on validation set score_mse_ridge_scikit_val = score_mean_squared_error( model_ridge_scikit, x_poly_val, y_val) print('Validation set mean squared error: {:.4f}'.format( score_mse_ridge_scikit_val)) score_r2_ridge_scikit_val = model_ridge_scikit.score(x_poly_val, y_val) print('Validation set r-squared scores: {:.4f}'.format(
print("Linear regression (order 5) score is: {0}".format(lr_5_model.score(X_test_poly, y_test)))
plt.plot(xx, yy_poly)
plt.plot(X_test, y_test, "o")
plt.ylim([0, 30])
plt.title("Linear regression (order 5) result")
plt.show()

# `normalize=False` was only ever the default and newer scikit-learn
# releases no longer accept the argument, so it is not passed.
ridge_model = Ridge(alpha=1)
ridge_model.fit(X_train_poly, y_train)
yy_ridge = ridge_model.predict(xx_poly)

# Todo: write to report
print("Ridge regression (order 5) score is: {0}".format(ridge_model.score(X_test_poly, y_test)))
# Fitted polynomial: intercept followed by the first five coefficients.
print("y2= {0} + {1} x + {2} x*x + {3} x*x*x + {4} x*x*x*x +{5} x*x*x*x*x".
      format(ridge_model.intercept_[0],
             ridge_model.coef_[0][0],
             ridge_model.coef_[0][1],
             ridge_model.coef_[0][2],
             ridge_model.coef_[0][3],
             ridge_model.coef_[0][4]))
plt.plot(xx, yy_ridge)
plt.plot(X_test, y_test, "o")
plt.ylim([0, 30])
plt.title("Ridge regression (order 5) result")
plt.show()

# Compare
# 1. The model with the highest score is: Ridge model (order 5)
# 2. Ridge model can prevent over-fitting: yes
# 3. Ridge model is nearly equivalent to LR model (order 5) if alpha=0: yes
# 4. A larger alpha results in a larger coefficient for x*x*x*x*x: no
linreg.score(X_train, y_train))) print("R-Squared Value for Test Set: {:.3f}".format( linreg.score(X_test, y_test))) # KNeighborsRegressor knnreg = KNeighborsRegressor(n_neighbors=2) knnreg.fit(X_train, y_train) print('R-squared train score: {:.3f}'.format(knnreg.score(X_train, y_train))) print('R-squared test score: {:.3f}'.format(knnreg.score(X_test, y_test))) # Ridge ridge = Ridge() ridge.fit(X_train, y_train) print('R-squared score (training): {:.3f}'.format(ridge.score( X_train, y_train))) print('R-squared score (test): {:.3f}'.format(ridge.score(X_test, y_test))) # Lasso lasso = Lasso(max_iter=10000) lasso.fit(X_train, y_train) print('R-squared score (training): {:.3f}'.format(lasso.score( X_train, y_train))) print('R-squared score (test): {:.3f}'.format(lasso.score(X_test, y_test))) lasso = Lasso(alpha=100, max_iter=10000) lasso.fit(train_processed, train['revenue']) results = lasso.predict(test_processed) results_2 = np.exp(results) print(results_2)
# In ridge regression the coefficients (w) are chosen not only by how well
# they predict the training data; they are also fitted under an additional
# constraint. We want the magnitude of the coefficients to be as small as
# possible, i.e. all elements of w should be close to zero. This means each
# feature should influence the result as little as possible.
from sklearn.linear_model import Ridge

ridge = Ridge()
ridge.fit(X_train, y_train)
print("Правильность на обучающем наборе: {:.2f}".format(ridge.score(X_train, y_train)))
print("Правильность на тестовом наборе: {:.2f}".format(ridge.score(X_test, y_test)))

# Increasing alpha shrinks the coefficients towards zero, which lowers
# training-set performance but may improve generalisation.
ridge10 = Ridge(alpha=10)
ridge10.fit(X_train, y_train)
print("Правильность на обучающем наборе: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Правильность на тестовом наборе: {:.2f}".format(ridge10.score(X_test, y_test)))

# With very small alpha the coefficients are barely constrained and the
# model ends up resembling ordinary linear regression.
ridge01 = Ridge(alpha=0.1)
ridge01.fit(X_train, y_train)
print("Правильность на обучающем наборе: {:.2f}".format(ridge01.score(X_train, y_train)))
# Evaluate the model: horizontal bar chart of the linear-model importances.
plt.figure(figsize=(15, 10))
ft_importances_lm.plot(kind='barh')
plt.show()

# R^2 on the test data, as a percentage rounded to two decimals.
for heading, fitted in (
        ("RSquare Value for Simple Regresssion TEST data is-", lm),
        ("RSquare Value for Lasso Regresssion TEST data is-", lm_lasso),
        ("RSquare Value for Ridge Regresssion TEST data is-", lm_ridge),
        ("RSquare Value for Elastic Net Regresssion TEST data is-", lm_elastic)):
    print(heading)
    print(np.round(fitted.score(features_test, labels_test) * 100, 2))

# Test-set predictions of each model.
predict_test_lm = lm.predict(features_test)
predict_test_lasso = lm_lasso.predict(features_test)
predict_test_ridge = lm_ridge.predict(features_test)
predict_test_elastic = lm_elastic.predict(features_test)

# Imports for the loss-function section (MSE and MAE).
import numpy as np
from sklearn import metrics
def prediction_ridge (X_train, Y_train, X_test, Y_test,alpha,normalize):
    """Fit a Ridge model on the training data and summarise it.

    Returns ``(pred_test, coeff_df, pred_train, result)``: the test
    predictions, a DataFrame with one "Fact" / "Coefficient" row per training
    column, the training predictions, and a dict of summary values (method
    label, normalize flag, input shapes, intercept, coefficient count, the
    fact with the largest coefficient, train/test MSE and the test score).
    The description of the top fact is looked up in the module-level
    ``cf_dict``.
    """
    lreg = Ridge(alpha=alpha, normalize=normalize)
    lreg.fit(X_train, Y_train)

    # One row per training column, paired with its fitted coefficient.
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)

    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # Row holding the largest coefficient.
    top_row = coeff_df.iloc[coeff_df["Coefficient"].idxmax()]

    result = {
        "method": "Ridge %.3f " % alpha,
        "normalize": "Y" if normalize else "N",
        "X_train_shape": X_train.shape,
        "Y_train_shape": Y_train.shape,
        "X_test_shape": X_test.shape,
        "Y_test_shape": Y_test.shape,
        "intercept": lreg.intercept_,
        "num_coef": len(lreg.coef_),
        "max_fact": cf_dict.loc[top_row["Fact"], "description"],
        "max_fact_value": top_row["Coefficient"],
        "MSE_train": np.mean((Y_train - pred_train) ** 2),
        "MSE_test": np.mean((Y_test - pred_test) ** 2),
        "variance": lreg.score(X_test, Y_test),
    }
    return pred_test, coeff_df, pred_train, result
def load_extended_boston():
    """Return the Boston features min-max scaled and expanded to degree 2.

    The second element of the returned pair is the unchanged target.
    """
    boston = load_boston()
    scaled = MinMaxScaler().fit_transform(boston.data)
    expander = PolynomialFeatures(degree=2, include_bias=False)
    return expander.fit_transform(scaled), boston.target


X, y = load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

from sklearn.linear_model import Ridge

# Default, strong and weak regularisation.
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge10 = Ridge(alpha=10)
ridge10.fit(X_train, y_train)
ridge01 = Ridge(alpha=0.1)
ridge01.fit(X_train, y_train)

for fitted in (ridge, ridge10, ridge01):
    print("train accuracy: {:.2f}".format(fitted.score(X_train, y_train)))
    print("test accuracy: {:.2f}".format(fitted.score(X_test, y_test)))

lr = LinearRegression().fit(X_train, y_train)

# Coefficients of each ridge model with its own marker.
for fitted, marker, legend_label in ((ridge10, '^', "Ridge alpha=10"),
                                     (ridge, 's', "Ridge alpha=1"),
                                     (ridge01, 'v', "Ridge alpha=0.1")):
    plt.plot(fitted.coef_, marker, label=legend_label)
from sklearn.linear_model import Ridge

# Default-alpha ridge model fitted on the training split.
ridge = Ridge().fit(X_train, y_train)


# In[12]:

pred_test = ridge.predict(X_test)
pred_test


# In[13]:

ridge.score(X_test, y_test)


# In[14]:

# Mean squared error of the held-out predictions
from sklearn.metrics import mean_squared_error
mean_squared_error(y_true=y_test, y_pred=pred_test)


# In[16]:

# Random forest regression
from sklearn.ensemble import RandomForestRegressor
x_train, x_test, y_train, y_test = train_test_split(sal_munged, y, test_size=0.3)
x_train = x_train.reshape(-1, x_train.shape[1])

regr = Ridge()
regr.fit(x_train, y_train)

# Prediction errors on each split, reused for the metrics and histograms.
train_error = regr.predict(x_train) - y_train
test_error = regr.predict(x_test) - y_test
separator = 30 * "* "

### MODEL PERFORMANCE ###
# Mean squared error
print("Mean Squared Error, training data: %d" % np.mean(train_error ** 2))
print("Mean Squared Error, test data: %d" % np.mean(test_error ** 2))
print(separator)

# Score returned by the fitted model
print("Variance score, training data: %.2f" % regr.score(x_train, y_train))
print("Variance score, test data: %.2f" % regr.score(x_test, y_test))
print(separator)

### GRAPHS: DISTRIBUTION OF ERROR ###
print("Distribution of prediction error on training data:")
predError = train_error
plt.hist(predError)
plt.xlim(-80000, 80000)
plt.show()

print("Distribution of prediction error on test data:")
predError = test_error
plt.hist(predError)
plt.xlim(-80000, 80000)
plt.show()
def TrainModel(self):
    """Train six regressors on the stored split and show their scores.

    Each model's train and test score (``model.score`` times 100, rounded to
    two decimals) is appended to ``self.browser``; the test scores are then
    displayed in a sortable ``QTableView`` stored on ``self.tableView``.
    LinearRegression, Ridge, Lasso and ExtraTrees are fitted on the raw
    split, while SGD and SVR are fitted on standardised copies of it.
    """
    self.browser.clear()
    X_train, X_test, y_train, y_test = self.X_train, self.X_test, self.y_train, self.y_test

    # Standardised copies; both scalers are fitted on training data only.
    X_train_raw, X_test_raw = X_train.values, X_test.values
    y_train_col = y_train.values.reshape(-1, 1)
    y_test_col = y_test.values.reshape(-1, 1)
    scalerX = preprocessing.StandardScaler().fit(X_train_raw)
    scalery = preprocessing.StandardScaler().fit(y_train_col)
    X_train_std = scalerX.transform(X_train_raw)
    X_test_std = scalerX.transform(X_test_raw)
    # The scaler needs 2-D input, but the estimators expect a 1-D target;
    # flatten to avoid the column-vector conversion warning.
    y_train_std = scalery.transform(y_train_col).ravel()
    y_test_std = scalery.transform(y_test_col).ravel()

    self.browser.append("Load Dataset")
    self.browser.append("")
    self.browser.append("")

    def fit_and_report(title, model, train_X, train_y, test_X, test_y,
                       show_coef=False):
        # Fit one model, log its scores, and return the test score.
        model.fit(train_X, train_y)
        acc_train = round(model.score(train_X, train_y) * 100, 2)
        acc_test = round(model.score(test_X, test_y) * 100, 2)
        self.browser.append(title)
        self.browser.append("Train acc : " + str(acc_train) + "%")
        self.browser.append("Test acc : " + str(acc_test) + "%")
        if show_coef:
            # Number of non-zero coefficients.
            self.browser.append("Used Coefficient : " + str(np.sum(model.coef_ != 0)))
        self.browser.append("")
        return acc_test

    raw_split = (X_train, y_train, X_test, y_test)
    std_split = (X_train_std, y_train_std, X_test_std, y_test_std)

    acc_lm_test = fit_and_report(
        "<LinearRegression Model>", LinearRegression(), *raw_split)
    acc_ridge_test = fit_and_report(
        "<Ridge Regression Model>", Ridge(alpha=0.1), *raw_split,
        show_coef=True)
    acc_lasso_test = fit_and_report(
        "<Lasso Regression Model>", Lasso(alpha=0.1, max_iter=100000),
        *raw_split, show_coef=True)
    # Squared loss is SGDRegressor's default. The explicit "squared_loss"
    # name was removed from newer scikit-learn, so it is not passed.
    acc_sgd_test = fit_and_report(
        "<Stochastic Gradient Descent Regression>",
        SGDRegressor(penalty=None, random_state=42, max_iter=100000),
        *std_split)
    acc_etr_test = fit_and_report(
        "<Extra Trees Regressor(Random Forest)>", ExtraTreesRegressor(),
        *raw_split)
    acc_svr_test = fit_and_report(
        "<Support Vector Machine>", SVR(), *std_split)

    models = pd.DataFrame({
        'Model': [
            'LinearRegression', 'Ridge Regression', 'Lasso Regression',
            'SGD Regression', 'Extra Trees Regressor',
            'Support Vector Machine'
        ],
        'Score': [
            acc_lm_test, acc_ridge_test, acc_lasso_test, acc_sgd_test,
            acc_etr_test, acc_svr_test
        ]
    })
    # Rows keep insertion order here (the old sort_values result was never
    # assigned); the view below is asked to sort by score.
    models = PandasModelTrainData(models)
    self.tableView = QTableView()
    self.tableView.setSortingEnabled(True)
    self.tableView.setModel(models)
    self.tableView.setGeometry(850, 100, 320, 400)
    self.tableView.setColumnWidth(0, 200)
    self.tableView.sortByColumn(1, Qt.DescendingOrder)
    self.tableView.setWindowTitle("Accuracy")
    self.tableView.show()
def ridge_reg(X,Y,data_file,p=False):
    """
    Does ridge regression on the data provided

    Fits one Ridge model per alpha in (0.001, 0.01, 0.1, 10, 100, 1000) on a
    70/30 split (random_state=3) and keeps the alpha with the highest test
    score; on ties the earliest alpha wins.

    Inputs
    ------
    X : Columns of the pandas dataframe that contain the data for each of the
        descriptors to be used

    Y : Column of the pandas dataframe that contains the values to be
        predicted

    data_file : Writable object (its ``write`` method is called) that
                receives the best model's score and RMSE

    p : When true, the same statistics are also printed

    Outputs
    -------
    stacked : Coefficients of the alpha = 0.01, 10, 100 and 1000 models
              concatenated along axis 0

    coefs : Array with the coefficients of the best-scoring model
    """
    X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=3)

    models = {}
    high_score = None
    alpha_ = None
    coefs = None
    rmse = None

    for alpha in (0.001, 0.01, 0.1, 10, 100, 1000):
        model = Ridge(alpha=alpha)
        model.fit(X_train, y_train)
        models[alpha] = model

        test_score = model.score(X_test, y_test)
        # The first alpha always seeds the best-so-far; later ones must
        # strictly beat it.
        if high_score is None or test_score > high_score:
            high_score = test_score
            alpha_ = alpha
            coefs = model.coef_
            rmse = np.sqrt(MSE(y_test, model.predict(X_test)))

    score_line = '\n\t\tRidge Regression Score with alpha=%f: \t%f' % (alpha_, high_score)
    data_file.write(score_line)
    data_file.write('\n\t\t\tRMSE: \t\t%f' % (rmse))

    if p:
        print(score_line)
        print('\n\t\tRMSE: \t\t%f' % (rmse))

    # Callers receive these four coefficient vectors; alpha=0.001 and
    # alpha=0.1 were never part of the stacked output.
    stacked = np.concatenate(
        [models[a].coef_ for a in (0.01, 10, 100, 1000)], axis=0)
    return stacked, np.array(coefs)
X1 = X_train_reduced[train] Y1 = Y_train_raw[train] X2 = X_train_reduced[test] Y2 = Y_train_raw[test] ## Train Classifiers on fold rdg_clf = Ridge(alpha=0.5) rdg_clf.fit(X1, Y1) lso_clf = Lasso(alpha=0.6257) lso_clf.fit(X1, Y1) svr_clf = LinearSVR(C=1e3) svr_clf.fit(X1, Y1) ## Score Classifiers on fold rdg_clf_score = rdg_clf.score(X2, Y2) lso_clf_score = lso_clf.score(X2, Y2) svr_clf_score = svr_clf.score(X2, Y2) print "Ridge: ", rdg_clf_score print "Lasso: ", lso_clf_score print "SVR_RBF: ", svr_clf_score ## Train final Classifiers # clf = Ridge(alpha=.5) clf = LinearSVR(C=1e3, gamma=0.1) clf.fit(X_train_reduced, Y_train_raw) Y_predicted = clf.predict(X_test_reduced) ## Save results to csv
#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

train = train.fillna(0.0)
test = test.fillna(0.0)

# Features are every column of df except the target 'nota'.
# x_train aliases df, so removing the column also changes df.
x_train = df
y_train = x_train.nota.values
del x_train['nota']

hyperparams = {'alpha':[0.0005, 0.0014, 0.0006, 0.00061, 0.000612, 0.000613001, 0.000614, 0.00061401, 0.00061402, 0.00061403, 0.0006104 ]}

gs = GridSearchCV(estimator=Ridge(normalize=True), param_grid=hyperparams)
gs.fit(x_train, y_train)
pred = pd.Series(gs.predict(test))

# With no `scoring` argument, gs.score is the Ridge estimator's own score
# (R^2) on the training data, not a logarithmic error.
err = gs.score(x_train, y_train)

print('Result:')
print('Best parameter: ',gs.best_params_)
print('Best score: ',gs.best_score_)
print('Training R^2 score: ', err)
print('\n')

ridge2 = Ridge(alpha = 0.0005, normalize=True)
ridge2.fit(x_train, y_train)
print(ridge2.score(x_train, y_train))

result = pd.DataFrame(ridge2.predict(test), index = test.index, columns=['nota'])
# Call form works under both Python 2 and Python 3.
print(result)
#result = result.drop_duplicates(subset='atleta_id', keep="last")
#result['atleta_id'] = result['atleta_id'].apply(lambda x:str(x))
result.to_csv('submission.csv')
#################
# Regularization
##################

# Ridge regression: L2 penalty, alpha is the regularization parameter.
# Ridge leads to dense solutions, in which most coefficients are non-zero.
from sklearn.linear_model import Ridge

ridge_models, training_scores, test_scores = {}, [], []

for alpha in (100, 10, 1, .01):
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_train, y_train)
    training_scores.append(ridge.score(X_train, y_train))
    test_scores.append(ridge.score(X_test, y_test))
    ridge_models[alpha] = ridge

# Train and test score per alpha, with the alpha values as tick labels.
plt.plot(training_scores, label="training scores")
plt.plot(test_scores, label="test scores")
plt.xticks(range(4), [100, 10, 1, .01])
plt.legend(loc="best")

# Lasso: L1 penalty, alpha is the regularization parameter.
# Lasso leads to sparse solutions, driving most coefficients to zero.
from sklearn.linear_model import Lasso

lasso_models = {}
training_scores = []
#
# Errors of the predictions currently held in y_pred.
mse_value = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse_value)
print('Root Mean Squared Error:', np.sqrt(mse_value))
#
# -------------------------------------
# --- RIDGE REGRESSION ---
#
boston_rr = Ridge().fit(X_train, y_train)

print("Coefficients: ", boston_rr.coef_)
print("Intercept: ", boston_rr.intercept_)

# R^2 for the train and test sets
print('R2 for train: ', boston_rr.score(X_train, y_train))
print('R2 for test: ', boston_rr.score(X_test, y_test))

# Ridge predictions next to the actual values
y_pred = boston_rr.predict(X_test)
df = pd.DataFrame({'actual': y_test, 'pred': y_pred})
print(df)
print(pd.DataFrame(boston_rr.coef_))

# Errors of the ridge predictions
mse_value = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse_value)
print('Root Mean Squared Error:', np.sqrt(mse_value))
# -------------------------------------
# Noisy samples of the line y = 5x + 2 at the 21 integers from 0 to 20.
np.random.seed(42)
x = np.linspace(0, 20, 21)
noise = np.random.normal(0.0, 20.0, 21)
y = 5 * x + 2 + noise

# Hint: if you get a shape error from scikit, try:
X = x.reshape(-1, 1)

# Degree-30 polynomial expansion, then min-max scaling of every column.
poly = PolynomialFeatures(30)
Xpoly = poly.fit_transform(X)
Xscaled = MinMaxScaler().fit_transform(Xpoly)

Xtrain, Xtest, ytrain, ytest = train_test_split(Xscaled, y, random_state=42)

m = Ridge(alpha=0.1).fit(Xtrain, ytrain)
m.score(Xtrain, ytrain)
m.score(Xtest, ytest)

ypred = m.predict(Xscaled)

# One bar per fitted coefficient.
plt.bar(range(31), m.coef_)

# Fit against the scaled first-degree feature.
plt.plot(Xtrain[:, 1], ytrain, 'bo')
plt.plot(Xtest[:, 1], ytest, 'kx')
plt.plot(Xscaled[:, 1], ypred, 'r-')
plt.axis([0.0, 1.0, 0.0, 140.0])

# Same points again, then the fit against the original x values.
plt.plot(Xtrain[:, 1], ytrain, 'bo')
plt.plot(Xtest[:, 1], ytest, 'kx')
plt.plot(x, y, 'bo')
plt.plot(x, ypred, 'r-')
plt.axis([2.0, 20.0, 20.0, 140.0])
print("accuracy",ac1)
y_pred1=model2.predict(X_test)
print("prediction",y_pred1)

# Visualisation of the polynomial model
plt.scatter(x1,y1,color='red')
plt.plot(x1,model2.predict(pol_reg.fit_transform(x1)),color='blue')
# pyplot has no `tittle` attribute; the function is `title`.
plt.title("Truth or bbluff (linear regression)")
plt.xlabel("squarfit_living")
plt.ylabel("price")
plt.show()

#-------------Above model is overfitted--------------------
# Apply ridge regression to counter the overfitting.
from sklearn.linear_model import Ridge
ridmodel=Ridge(alpha=0.000000000000005,normalize=True)
ridmodel.fit(X_train,y_train)
rid_pre=ridmodel.predict(X_test)
print(rid_pre)
ac2=ridmodel.score(X_test,y_test)
print("accuracy",ac2)

# Visualisation of the ridge model
plt.scatter(x1,y1,color='red')
plt.plot(x1,ridmodel.predict(pol_reg.fit_transform(x1)),color='blue')
plt.title("Truth or bbluff (linear regression)")
plt.xlabel("squarfit_living")
plt.ylabel("price")
plt.show()
train_x,text_x,train_y,text_y = cross_validation.train_test_split(X1,y1,train_size=0.5,random_state=1)

#f_fold = StratifiedKFold(y=y1,n_folds=10,random_state=1)
f_fold = KFold(len(y1),n_folds=10,random_state=0)

# Per-fold metrics, one entry appended per fold.
score = []
mean_square_score_train= []
mean_square_score_test = []
r2_score_train = []
r2_score_test = []
train_stuff=[]
test_stuff=[]

for k, (train,text) in enumerate(f_fold):
    predictor.fit(X1[train],y1[train])
    c = predictor.score(X1[text],y1[text])
    score.append(c)

    # Predict once per split and reuse the result for every metric.
    pred_train = predictor.predict(X1[train])
    pred_test = predictor.predict(X1[text])

    mean_square_score_train.append(mean_squared_error(y1[train],pred_train))
    mean_square_score_test.append(mean_squared_error(y1[text],pred_test))
    r2_score_train.append(r2_score(y1[train],pred_train))
    r2_score_test.append(r2_score(y1[text],pred_test))

    # Share of predictions within 7 of the target. The count is converted to
    # float first so the division is not truncated under Python 2.
    pct_train = float(sum(abs(pred_train - y1[train]) < 7)) / len(X1[train]) * 100
    pct_test = float(sum(abs(pred_test - y1[text]) < 7)) / len(X1[text]) * 100

    print("percentage within 7 days error for training data " + str(pct_train))
    print("percentage within 7 days error for testing data " + str(pct_test))
    print("-------------------")

    train_stuff.append(pct_train)
    test_stuff.append(pct_test)
# Model generation and prediction: three linear models are fitted on the
# training split and scored on the test split.
from sklearn.linear_model import Lasso, LinearRegression, Ridge

# Ordinary least-squares linear regression.
clfreg = LinearRegression(n_jobs=1)
clfreg.fit(X_train, y_train)
confidencereg = clfreg.score(X_test, y_test)
y_pred = clfreg.predict(X_test)

# Ridge regression with alpha=0.01.
rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train)
confidenceridge = rr.score(X_test, y_test)
y_pred_ridge = rr.predict(X_test)

# Lasso regression with the estimator's default settings.
ls = Lasso()
ls.fit(X_train, y_train)
confidencelasso = ls.score(X_test, y_test)
y_pred_lasso = ls.predict(X_test)

# Plot the first 100 actual targets against the linear-regression predictions.
import matplotlib.pyplot as plt

plt.plot(y_test[:100])
plt.plot(y_pred[:100])
plt.legend(['Actual', 'Linear Predicted'], loc='upper right')
plt.show()
def RidgReg(self):
    """Fit a ridge regression on the instance's data and print its score.

    The model is fitted on ``self.exec_data_X`` / ``self.exec_data_Y`` and
    scored on that same data, so the printed value is a training score.
    Nothing is returned.
    """
    # `normalize=False` used to be passed explicitly here.  It was the
    # default, and the keyword was removed from Ridge in scikit-learn 1.2,
    # so omitting it keeps the same model while avoiding a TypeError there.
    r = Ridge(
        alpha=1.0,
        fit_intercept=True,
        copy_X=True,
        max_iter=None,
        tol=0.001,
        solver='auto',
        random_state=None,
    )
    r = r.fit(self.exec_data_X, self.exec_data_Y)
    # Label and value are printed on one line, separated by a space.
    print("Score for Ridge Regression", end=" ")
    print(r.score(self.exec_data_X, self.exec_data_Y))
param_grid=parameters, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1) grid_search = grid_search.fit(X_poly[:, 1:], y_train) best_mse = grid_search.best_score_ best_parameters = grid_search.best_params_ ridgeReg = Ridge(fit_intercept=False, normalize=True, alpha=0.01, tol=1e-5, max_iter=13000, solver='auto') ridgeReg.fit(X_poly, y_train) y_pred = ridgeReg.predict(X_poly) sums = (y_pred - y_train)**2 sums = (np.sum(sums)) / len(y_pred) score = ridgeReg.score(X_poly, y_train) print(f'Training error {round(sums * (10**3),3) }') print(f'Traning Score {round(score,3)} \n') prediction = cross_val_predict(ridgeReg, X_poly, y_train, cv=5) sums = (prediction - y_train)**2 sums = (np.sum(sums)) / len(prediction) accuracies = cross_val_score(estimator=ridgeReg, X=X_poly, y=y_train, cv=5) print(f'Validation error {round(sums * (10**3),3) }') print(f'Validation Score {round(accuracies.mean(),3)} \n')
def main():
    """Regress per-sequence scores on k-mer features and write the model out.

    Command line: ``<fasta> <scores>`` plus the options declared below.
    Steps, in order:

    1. Build k-mer feature vectors with ``fasta_string_kernel`` (optionally
       adding a length feature via ``add_length_feature``).
    2. Read ``<header> <score>`` pairs from the scores file; a first line
       that cannot be parsed is treated as a header and skipped.
    3. Assemble a dense feature matrix ``X`` and target vector ``y``.
    4. Fit the model selected with ``-m`` (ols, pls, ridge, svm or gp).
    5. Optionally cross-validate with ``-c`` folds.
    6. Write the score(s) and one coefficient per k-mer to the output file.

    Exits with status 1 when the method name is not recognized.
    """
    usage = 'usage: %prog [options] <fasta> <scores>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='canonical_kmers', default=False, action='store_true', help='Count canonical k-mers [Default: %default]')
    parser.add_option('--alpha', dest='alpha', default=None, type='float', help='Regularization alpha parameter. Will choose via CV if not specified [Default: %default]')
    parser.add_option('-c', dest='cv_folds', default=0, type='int', help='Cross-validate with this many folds [Default: %default]')
    parser.add_option('--epsilon', dest='epsilon', default=None, type='float', help='Regularization epsilon parameter. Will choose via CV if not specified [Default: %default]')
    parser.add_option('-g', dest='gaps', default=0, type='int', help='Gaps in k-mers string kernel [Default: %default]')
    parser.add_option('-k', dest='k', default=4, type='int', help='K-mer size for string kernel [Default: %default]')
    parser.add_option('-l', dest='length', default=False, action='store_true', help='Add log2 sequence length as an attribute [Default: %default]')
    parser.add_option('-m', dest='method', default='ols', help='Regression method [Default: %default]')
    parser.add_option('-o', dest='output_file', default='seq_regr.txt', help='Output file [Default: %default]')
    parser.add_option('-w', dest='whiten', default=False, action='store_true', help='Whiten the sequence scores [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide fasta file and scores file')
    else:
        fasta_file = args[0]
        scores_file = args[1]

    method = options.method.lower()

    ##################################################
    # convert sequences to feature representations
    ##################################################
    seq_vectors = fasta_string_kernel(fasta_file, options.k, options.gaps, options.canonical_kmers)

    if options.length:
        add_length_feature(seq_vectors, fasta_file)

    ##################################################
    # read scores
    ##################################################
    seq_scores = {}
    with open(scores_file) as scores_in:
        try:
            line = scores_in.readline()
            a = line.split()
            seq_scores[a[0]] = float(a[1])
        except (IndexError, ValueError):
            # possible header line (too few columns or a non-numeric score)
            pass

        for line in scores_in:
            a = line.split()
            seq_scores[a[0]] = float(a[1])

    ##################################################
    # make scikit-learn data structures
    ##################################################
    # Dense matrix: one row per sequence header, one column per k-mer seen
    # in any sequence; k-mers missing from a sequence count as 0.
    kmers = set()
    for kmer_vec in seq_vectors.values():
        kmers |= set(kmer_vec.keys())
    kmers_sort = sorted(kmers)

    seq_headers = sorted(seq_vectors.keys())

    X = np.array([[seq_vectors[header].get(kmer, 0) for kmer in kmers_sort] for header in seq_headers])
    y = np.array([seq_scores[header] for header in seq_headers])

    if options.whiten:
        y = preprocessing.scale(y)

    ##################################################
    # decide method
    ##################################################
    if method == 'ols':
        model = LinearRegression()

    elif method == 'pls':
        model = PLSRegression(n_components=2)

    elif method == 'ridge':
        if options.alpha:
            # model = Ridge(alpha=options.alpha)
            model = RidgeCV(alphas=[options.alpha], store_cv_values=True)
        else:
            #model = RidgeCV(alphas=[0.0001, 0.0002, 0.0004, 0.0008, .0016, 0.0032, 0.0064, .0128], store_cv_values=True)
            model = RidgeCV(alphas=[0.0004, 0.0008, 0.0016, 0.0032], store_cv_values=True)

    elif method == 'svm':
        if options.alpha:
            svm_c = len(y) / options.alpha
        else:
            svm_c = 100

        if options.epsilon:
            svm_eps = options.epsilon
        else:
            svm_eps = 0.5

        model = SVR(kernel='linear', degree=3, C=svm_c, epsilon=svm_eps)

    elif method == 'gp':
        model = GaussianProcess()

    else:
        print('Method not recognized.', file=sys.stderr)
        sys.exit(1)

    ##################################################
    # learn model
    ##################################################
    model.fit(X, y)

    # Total sum of squares of y, the denominator of every score below.
    ss_tot = sum(np.square(y - np.mean(y)))

    if method == 'ridge':
        # Report a score per candidate alpha from RidgeCV's stored CV values.
        for i, alpha in enumerate(model.alphas):
            score_cv = (1.0 - sum(model.cv_values_[:, i]) / ss_tot)
            print('RidgeCV alpha=%.5f score=%f' % (alpha, score_cv), file=sys.stderr)

    ##################################################
    # cross-validate
    ##################################################
    if options.cv_folds > 0:
        ss_reg = 0

        if method == 'ridge':
            # Re-use the alpha RidgeCV selected on the full data.
            model_cv = Ridge(alpha=model.alpha_)
        else:
            model_cv = copy.copy(model)

        kf = KFold(len(y), n_folds=options.cv_folds, shuffle=True)
        for train, test in kf:
            X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

            # learn on train
            model_cv.fit(X_train, y_train)

            # accumulate squared error on test
            ss_reg += sum(np.square(y_test - model_cv.predict(X_test)))

        score_cv = 1 - ss_reg / ss_tot

    ##################################################
    # output model information
    ##################################################
    with open(options.output_file, 'w') as model_out:
        print('Score\t%.3f' % model.score(X, y), file=model_out)
        if options.cv_folds > 0:
            print('ScoreCV\t%.3f' % score_cv, file=model_out)
        if method == 'ridge' and options.alpha:
            score_cv = (1.0 - sum(model.cv_values_) / ss_tot)
            print('ScoreCV\t%.3f' % score_cv, file=model_out)

        for i, kmer in enumerate(kmers_sort):
            if method == 'pls':
                coef_i = model.coefs[i]
            else:
                coef_i = model.coef_[i]
            print('%s\t%f' % (kmer, coef_i), file=model_out)
import mglearn
import sklearn
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


def _report_scores(model):
    """Print the training-set and test-set scores of a fitted model."""
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    print("Training set score: {:.2f}".format(train_score))
    print("Test set score: {:.2f}".format(test_score))


# Extended Boston data, split with a fixed seed.
X, y = mglearn.datasets.load_extended_boston()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Unregularized baseline.
lr = LinearRegression().fit(X_train, y_train)

# Ridge with the estimator's default alpha.
ridge = Ridge().fit(X_train, y_train)
_report_scores(ridge)
# Training set score: 0.89
# Test set score: 0.75

# Ridge with alpha=10.
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
_report_scores(ridge10)
# Training set score: 0.79
# Test set score: 0.64

# Ridge with alpha=0.1.
ridge01 = Ridge(alpha=0.1).fit(X_train, y_train)
_report_scores(ridge01)
# Training set score: 0.93
# Feature columns and target column taken from the sales frame.
X = Salesdf[['perishable', 'item_nbr', 'store_nbr', 'cluster']]
y = Salesdf[["unit_sales"]]

# 5-fold cross-validation of a plain linear regression.
# NOTE(review): X_train / y_train / X_test / y_test are not derived here;
# presumably they are split from X and y elsewhere -- confirm.
reg = linear_model.LinearRegression()
cv_results = cross_val_score(reg, X_train, y_train, cv=5)
print(cv_results)
print(np.mean(cv_results))
print(np.std(cv_results))

# Ridge regression: the ordinary least-squares loss plus alpha times the
# sum of the squared coefficients.
ridge = Ridge(alpha=0.1, normalize = True)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge.score(X_test, y_test)
# Original author's observation: the ridge score is close to the score of
# the linear model above.
print("number of test samples:", x_test.shape[0])
print("number of training samples:", x_train.shape[0])


# ### Question 9
# Create and fit a Ridge regression object using the training data, set the regularization parameter to 0.1, and calculate the R^2 using the test data.
#

# In[26]:


from sklearn.linear_model import Ridge


# In[27]:


RigeModel = Ridge(alpha=0.1)
RigeModel.fit(x_train, y_train)
RigeModel.score(x_test, y_test)


# ### Question 10
# Perform a second order polynomial transform on both the training data and testing data. Create and fit a Ridge regression object using the training data, set the regularisation parameter to 0.1, and calculate the R^2 utilising the test data provided. Take a screenshot of your code and the R^2.

# In[28]:


pr = PolynomialFeatures(degree=2)
x_train_pr = pr.fit_transform(x_train[features])
# The transformer is fitted on the training data only; the test data is
# transformed with that fitted instance instead of being re-fitted.
x_test_pr = pr.transform(x_test[features])

RigeModel = Ridge(alpha=0.1)
RigeModel.fit(x_train_pr, y_train)
RigeModel.score(x_test_pr, y_test)


# <p>Once you complete your notebook you will have to share it. Select the icon on the top right, marked in red in the image below; a dialogue box should open, and select the option all content excluding sensitive code cells.</p>
print("Minimum Error for Ridge Model: ", minimum_error)
print("Minimum Error for Lasso Model: ", minimum_error_lasso)


def ord_to_char(v, p=None):
    """Return the character whose code point is ``int(v)``; ``p`` is unused."""
    code_point = int(v)
    return chr(code_point)


# Fit a ridge model and pick the 10 largest and 10 smallest coefficients
# for the housing-price prediction.
ridgeReg = Ridge()
ridgeReg.fit(X, Y)
coef = pd.Series(ridgeReg.coef_, index = X.columns)
coef_sorted = coef.sort_values()
relevant_Coeff = coef_sorted.tail(10)
# NOTE(review): these are the 10 smallest (most negative) coefficients, not
# the ones closest to zero -- confirm that "least relevant" is the intent.
irrelevant_Coeff = coef_sorted.head(10)

# Plots
plt.figure(figsize=(20,10))
relevant_Coeff.plot(kind = "barh", title="Most Relevant Aspects of a House")
plt.figure(figsize=(20,10))
irrelevant_Coeff.plot(kind = 'barh', title="Least Relevant Aspects of a House")

# Residual plot: predicted value against (true - predicted) on the data the
# model was fitted on.
plt.figure(figsize= (50,10))
preds = pd.DataFrame({"Predicted": ridgeReg.predict(X), "true": Y})
preds["Difference"] = preds["true"] - preds["Predicted"]
preds.plot(x = "Predicted", y = "Difference", kind = "scatter", title = "Residual Features")
print (ridgeReg.score(X,Y))

# expm1 undoes a log(x + 1) transform; presumably the target was
# log1p-transformed earlier -- confirm.
preds = np.expm1(ridgeReg.predict(X_test))
solution = pd.DataFrame({"id": test_DF.Id, "SalePrice": preds})
solution.to_csv("ridge_sol.csv", index = False)
# Split features/target and keep the selected attribute subset for both the
# final training set and the test set.
X_final, Y_final = select_Y(final, 19)
X_final = select_atributes(X_final, vektors[10])
X_test, Y_test = select_Y(test, 19)
X_test = select_atributes(X_test, vektors[10])

# Degree-2 polynomial expansion.  The transformer is fitted on the training
# data only and then applied to the test data, matching how the scaler
# below is handled.
poly = preprocessing.PolynomialFeatures(2)
X_final = poly.fit_transform(X_final)
X_test = poly.transform(X_test)

# Standardize using statistics learned from the training data.
scaler = StandardScaler()
scaler = scaler.fit(X_final)
X_final = scaler.transform(X_final)
X_test = scaler.transform(X_test)

# Fit, predict, and print the training score followed by the test score.
trained = RDG.fit(X_final, Y_final)
Y_predict = RDG.predict(X_test)
print(RDG.score(X_final, Y_final))
print(RDG.score(X_test, Y_test))

# Metrics of the model, next to the errors of a baseline that always
# predicts the training-set mean of the target.
Y_mean = np.mean(Y_final)
r2 = mtrcs.r2_score(Y_test, Y_predict)
mae = mtrcs.mean_absolute_error(Y_test, Y_predict)
mse = mtrcs.mean_squared_error(Y_test, Y_predict)
mae_predict = np.mean(np.abs(Y_test - Y_mean))
mse_predict = np.mean(np.power(np.abs(Y_test - Y_mean), 2))
msg = "%20s: %10f %10f %10f %10f %10f %10f %10f" % (
    "Testing results r2score,MAE,MSE,MAE diff",
    r2, mae, mse, mae_predict, mse_predict,
    mae_predict - mae, mse_predict - mse)
print(msg)

X_plot = select_atributes(final, vektor)

# Print each header name followed by a coefficient; index 0 of coef_ is
# skipped (presumably the polynomial bias column -- confirm).
for i in range(len(header)):
    print(header[i])
    print(RDG.coef_[0, i + 1])
# train[:,1:] = log10(nbaData[:,1:])
# Column 0 of `train` is the target; columns 1 onward are the features.
regression = Ridge(alpha=0.05)
kf = KFold(len(train), k=10)

avgResiduSum = 0
avgVar = 0
for tr, e in kf:
    regression.fit(train[tr, 1:], train[tr, 0])
    # Mean squared error on the held-out fold.
    avgResiduSum += mean((regression.predict(train[e, 1:]) - train[e, 0]) ** 2)
    # score() is the R^2 of the prediction: 1 is a perfect prediction.
    avgVar += regression.score(train[e, 1:], train[e, 0])

# Every print uses the parenthesised single-argument form, which produces
# the same output as the print statement and is also valid on Python 3.
print('############')
print('Evaluation Phase')
avgResiduSum = avgResiduSum / len(kf)
print("Average Residual sum of squares: %.2f" % avgResiduSum)
avgVar = avgVar / len(kf)
print('Average Variance score: %.2f' % avgVar)

print('############')
print('Testing Phase')
# Refit on all training rows, then evaluate on the scaled test set.
regression.fit(train[:, 1:], train[:, 0])
print("Residual sum of squares: %.2f"
      % mean((regression.predict(nba15test_scaled[:, 1:]) - nba15test_scaled[:, 0]) ** 2))
# score() is the R^2 of the prediction: 1 is a perfect prediction.
print('Variance score: %.2f' % regression.score(nba15test_scaled[:, 1:], nba15test_scaled[:, 0]))
# Features and target; 15% of the rows are held out for testing.
X = df[features]
Y = df['price']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.15, random_state=1)

print("number of test samples:", x_test.shape[0])
print("number of training samples:", x_train.shape[0])

#Question 9
#Create and fit a Ridge regression object using the training data,
#set the regularization parameter to 0.1, and calculate the R^2 using the test data.
from sklearn.linear_model import Ridge

RidgeModel = Ridge(alpha = 0.1)
RidgeModel.fit(x_train, y_train)
RidgeModel.score(x_test, y_test)

#Question 10
#Perform a second order polynomial transform on both the training data and testing data.
#Create and fit a Ridge regression object using the training data, set the regularisation parameter to 0.1,
#and calculate the R^2 utilising the test data provided. Take a screenshot of your code and the R^2.
SecondOrderPolynomialTransform = PolynomialFeatures(degree=2)
x_train_transformed = SecondOrderPolynomialTransform.fit_transform(x_train)
# The transformer is fitted on the training data only; the test data is
# transformed with that fitted instance instead of being re-fitted.
x_test_transformed = SecondOrderPolynomialTransform.transform(x_test)

NewRidgeModel = Ridge(alpha = 0.1)
NewRidgeModel.fit(x_train_transformed, y_train)
NewRidgeModel.score(x_test_transformed, y_test)