def find_best_fit(bagfiles):
    '''Crop ROS bag files at beginning and end to only use the portion in
    which the robot is moving. Fit the Ridge regression to these cropped
    bag files.
    '''
    clf = Ridge(alpha=1.0)  # TODO: auto-calibrate alpha (it's easy using a scikit-learn one-liner)
    image_data = []
    cmd_vel_data = []
    for bagfile_path in bagfiles:
        most_recent_cmd_vel = None
        bag = rosbag.Bag(bagfile_path)
        for topic, msg, t in bag.read_messages(
                topics=['/camera/image_raw/compressed', '/cmd_vel']):
            if topic == "/cmd_vel":
                if most_recent_cmd_vel is None and msg.linear.x > 0:
                    # The robot started moving: begin pairing images with commands.
                    most_recent_cmd_vel = msg
                elif most_recent_cmd_vel is not None and msg.linear.x == 0 and msg.angular.z == 0:
                    # The robot stopped: crop out everything until it moves again.
                    most_recent_cmd_vel = None
            elif topic == "/camera/image_raw/compressed" and most_recent_cmd_vel is not None:
                np_arr = np.fromstring(msg.data, np.uint8)
                cv_image = cv2.imdecode(np_arr, cv2.CV_LOAD_IMAGE_COLOR)
                image_data.append(extract_data(cv_image))
                cmd_vel_data.append(twist_to_nparray(most_recent_cmd_vel))
        bag.close()
    clf.fit(image_data, cmd_vel_data)
    return clf
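# The TODO above can be realized with RidgeCV, the scikit-learn one-liner that
# picks alpha by cross-validation; a minimal sketch (this alpha grid is an
# assumption, not something the original author specified):
from sklearn.linear_model import RidgeCV

clf = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0])  # drop-in replacement for Ridge(alpha=1.0)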
def test_brr_like_sklearn():
    n = 10000
    d = 10
    sigma_sqr = 5
    X = np.random.randn(n, d)
    beta_true = np.random.random(d)
    y = np.dot(X, beta_true) + np.sqrt(sigma_sqr) * np.random.randn(n)
    X_tr = X[:n // 2, :]  # integer division: n / 2 is a float under Python 3
    y_tr = y[:n // 2]
    X_ts = X[n // 2:, :]
    # y_ts = y[n // 2:]

    # prediction with my own bayesian ridge
    lambda_reg = 1
    brr = BayesianRidgeRegression(lambda_reg, add_ones=True, normalize_lambda=False)
    brr.fit(X_tr, y_tr)
    y_ts_brr = brr.predict(X_ts)

    # let's compare to scikit-learn's ridge regression
    rr = Ridge(lambda_reg)
    rr.fit(X_tr, y_tr)
    y_ts_rr = rr.predict(X_ts)

    assert np.mean(np.abs(y_ts_brr - y_ts_rr)) < 0.001, \
        "Predictions are different from sklearn's ridge regression."
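# For reference, the quantity both models above should agree on is the ridge
# solution in closed form. A minimal sketch, with the caveat that scikit-learn's
# Ridge leaves the intercept unpenalized, so the match is exact only with
# fit_intercept=False (the appended ones column mirrors add_ones=True):
import numpy as np

def ridge_closed_form(X, y, lam):
    # w = (X^T X + lam * I)^{-1} X^T y
    d = X.shape[1]
    return np.linalg.solve(X.T.dot(X) + lam * np.eye(d), X.T.dot(y))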
def test_sag_regressor_computed_correctly():
    """tests if the sag regressor is computed correctly"""
    alpha = .1
    n_features = 10
    n_samples = 40
    max_iter = 50
    tol = .000001
    fit_intercept = True
    rng = np.random.RandomState(0)
    X = rng.normal(size=(n_samples, n_features))
    w = rng.normal(size=n_features)
    y = np.dot(X, w) + 2.
    step_size = get_step_size(X, alpha, fit_intercept, classification=False)

    clf1 = Ridge(fit_intercept=fit_intercept, tol=tol, solver='sag',
                 alpha=alpha * n_samples, max_iter=max_iter)
    clf2 = clone(clf1)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)

    spweights1, spintercept1 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter,
                                          dloss=squared_dloss,
                                          fit_intercept=fit_intercept)
    spweights2, spintercept2 = sag_sparse(X, y, step_size, alpha,
                                          n_iter=max_iter,
                                          dloss=squared_dloss, sparse=True,
                                          fit_intercept=fit_intercept)

    assert_array_almost_equal(clf1.coef_.ravel(),
                              spweights1.ravel(),
                              decimal=3)
    assert_almost_equal(clf1.intercept_, spintercept1, decimal=1)
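# get_step_size is an assumed helper in the SAG tests here. A definition
# consistent with SAG's 1/L step-size rule (L bounded by the largest squared
# row norm, plus 1 for the intercept column, plus alpha) would be:
def get_step_size(X, alpha, fit_intercept, classification=True):
    if classification:
        return 4.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + 4.0 * alpha)
    return 1.0 / (np.max(np.sum(X * X, axis=1)) + fit_intercept + alpha)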
def reg_skl_ridge(param, data):
    [X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv] = data
    ridge = Ridge(alpha=param["alpha"], normalize=True)
    ridge.fit(X_tr, y_reg_tr)
    pred = ridge.predict(X_cv)
    RMSEScore = getscoreRMSE(y_reg_cv, pred)
    return RMSEScore, pred
def fit(self, X, Y, weights=None, context_transform=True):
    """ Trains policy by weighted maximum likelihood.

    .. note:: This call changes this policy (self)

    Parameters
    ----------
    X: array-like, shape (n_samples, context_dims)
        Context vectors

    Y: array-like, shape (n_samples, weight_dims)
        Low-level policy parameter vectors

    weights: array-like, shape (n_samples,)
        Weights of individual samples (should depend on the obtained reward)
    """
    # Kernel approximation
    self.nystroem = Nystroem(
        kernel=self.kernel, gamma=self.gamma, coef0=self.coef0,
        n_components=np.minimum(X.shape[0], self.n_components),
        random_state=self.random_state)
    self.X = self.nystroem.fit_transform(X)
    if self.bias:
        self.X = np.hstack((self.X, np.ones((self.X.shape[0], 1))))
    if self.normalize:
        self.X /= np.abs(self.X).sum(1)[:, None]

    # Standard ridge regression
    ridge = Ridge(alpha=self.alpha, fit_intercept=False)
    ridge.fit(self.X, Y, weights)  # weights passed as sample_weight
    self.W = ridge.coef_
def test_regressor_matching():
    n_samples = 10
    n_features = 5

    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    alpha = 1.
    n_iter = 100
    fit_intercept = True

    step_size = get_step_size(X, alpha, fit_intercept, classification=False)
    clf = Ridge(fit_intercept=fit_intercept, tol=1e-11, solver='sag',
                alpha=alpha * n_samples, max_iter=n_iter)
    clf.fit(X, y)

    weights1, intercept1 = sag_sparse(X, y, step_size, alpha, n_iter=n_iter,
                                      dloss=squared_dloss,
                                      fit_intercept=fit_intercept)
    weights2, intercept2 = sag(X, y, step_size, alpha, n_iter=n_iter,
                               dloss=squared_dloss,
                               fit_intercept=fit_intercept)

    assert_array_almost_equal(weights1, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept1, clf.intercept_, decimal=10)
    assert_array_almost_equal(weights2, clf.coef_, decimal=10)
    assert_array_almost_equal(intercept2, clf.intercept_, decimal=10)
def test_sag_pobj_matches_ridge_regression():
    """tests if the sag pobj matches ridge reg"""
    n_samples = 100
    n_features = 10
    alpha = 1.0
    n_iter = 100
    fit_intercept = False
    rng = np.random.RandomState(10)
    X = rng.normal(size=(n_samples, n_features))
    true_w = rng.normal(size=n_features)
    y = X.dot(true_w)

    clf1 = Ridge(fit_intercept=fit_intercept, tol=1e-11, solver='sag',
                 alpha=alpha, max_iter=n_iter, random_state=42)
    clf2 = clone(clf1)
    clf3 = Ridge(fit_intercept=fit_intercept, tol=1e-5, solver='lsqr',
                 alpha=alpha, max_iter=n_iter, random_state=42)

    clf1.fit(X, y)
    clf2.fit(sp.csr_matrix(X), y)
    clf3.fit(X, y)

    pobj1 = get_pobj(clf1.coef_, alpha, X, y, squared_loss)
    pobj2 = get_pobj(clf2.coef_, alpha, X, y, squared_loss)
    pobj3 = get_pobj(clf3.coef_, alpha, X, y, squared_loss)

    assert_array_almost_equal(pobj1, pobj2, decimal=4)
    assert_array_almost_equal(pobj1, pobj3, decimal=4)
    assert_array_almost_equal(pobj3, pobj2, decimal=4)
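# get_pobj and squared_loss are assumed helpers here. Definitions consistent
# with the comparisons above (mean squared loss plus an L2 penalty on w) would be:
def squared_loss(p, y):
    return np.mean(0.5 * (p - y) ** 2)

def get_pobj(w, alpha, X, y, loss):
    w = w.ravel()
    return loss(np.dot(X, w), y) + alpha * np.dot(w, w) / 2.0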
class OrderScorer(Scorer):
    def __init__(self):
        self.classifier = Ridge(alpha=0.1)
        self.cache_filename = 'subgraph_order_scorer_reg.pickle'

    def train(self, train_instances, train_labels, update_cache=True, sample_weight=None):
        """
        Trains a scorer to score the quality of an ordering of sentences
        Loads from cache if available
        """
        self.classifier.fit(train_instances, train_labels, sample_weight=sample_weight)
        if update_cache:
            with open(self.cache_filename, 'wb') as f:  # close the handle after dumping
                pickle.dump(self.classifier, f)

    def test(self, test_instances, test_labels):
        """
        Uses test set to evaluate the performance of the scorer and print it out
        """
        scores = self.classifier.predict(test_instances)
        # TODO: print report

    def load(self):
        if os.path.exists(self.cache_filename):
            with open(self.cache_filename, 'rb') as f:
                self.classifier = pickle.load(f)
        else:
            raise Exception("No classifier exists! Must call train with update_cache=True")

    def evaluate(self, test_instance):
        """
        Applies the scoring function to a given test instance
        """
        return self.classifier.predict([test_instance])[0]
def _make_forecast(self, model, name, alpha=None, l1_ratio=None):
    """
    Output: DataFrame

    Train on the holdout set and make predictions for the next week
    """
    X_hold = self.hold_set[self.hold_set.columns[1:]]
    if 'lyft' in self.filename:
        y_hold = self.hold_set['avg_est_price']
    else:
        y_hold = self.hold_set['avg_price_est']

    if name.split("_")[0] == "ridgecv":
        model = Ridge(alpha=alpha)
    elif name.split("_")[0] == "lassocv":
        model = Lasso(alpha=alpha)
    elif name.split("_")[0] == "elasticnetcv":
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X_hold, y_hold)

    self.X_forecast = X_hold.copy()
    # assumes weekofyear is increasing
    self.X_forecast['weekofyear'] = self.X_forecast['weekofyear'].apply(lambda x: x + 1)
    self.X_forecast.index = self.X_forecast.index + pd.Timedelta(days=7)
    self.y_forecast = model.predict(self.X_forecast)
    self.y_forecast = pd.DataFrame(self.y_forecast, index=self.X_forecast.index,
                                   columns=['y_forecast'])
    self.y_forecast = pd.concat([self.X_forecast, self.y_forecast], axis=1)
    saved_filename = "rideshare_app/data/{}_forecast.csv".format(name)
    self.y_forecast.to_csv(saved_filename)
    print "saved prediction values to {}".format(saved_filename)
def training(X, Y, X_test, pca='kpca', regressor='ridge', dim=50):
    # X and Y are numpy arrays
    print 'Input data and label shape: ', X.shape, Y.shape
    if pca == 'nopca':
        return simpleTraining(X, Y, X_test, regressor)
    model, P = getProjectionMatrixPCA(Y, dim) if pca == 'pca' else getProjectionMatrixKPCA(dim)
    Y_train = np.dot(Y, P) if pca == 'kpca' else np.dot(Y, P.transpose())
    regressors = []
    for i in range(dim):
        print 'at regressor number: ', i
        reg = Ridge() if regressor == 'ridge' else SVR()
        y = [x[i] for x in Y_train]
        reg.fit(X, y)
        regressors.append(reg)
    Z_pred = []
    for reg in regressors:
        Z_pred.append(reg.predict(X_test))
    print 'prediction shapes:', len(Z_pred), len(Z_pred[0])
    Z_pred = np.array(Z_pred)
    Y_pred = np.dot(P, Z_pred).transpose() if pca == 'kpca' else np.dot(Z_pred.transpose(), P)
    return model, regressors, Y_pred
class LogisticRegressionSeparator(BaseEstimator):
    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # let's predict which users will spend anything later
        classes = y - X[:, 0]
        classes = np.where(classes > 0.1, 1, 0)
        self.classifier = LogisticRegression(class_weight='balanced')
        self.classifier.fit(X, classes)
        results = self.classifier.predict(X)
        results = results == 1
        self.estimator = Ridge(alpha=0.05)
        self.estimator.fit(X[results], y[results])

    def predict(self, X):
        # copy so that writing predictions below cannot mutate X through a view
        y = X[:, 0].reshape(X.shape[0]).copy()
        labels = (self.classifier.predict(X) == 1)
        y[labels] = self.estimator.predict(X[labels])
        return y
def train_single_model(train_data, train_labels, algo):
    """ Train the model for a single label dimension """
    if algo == 'svr_rbf':
        """ SVM regression, RBF kernel """
        svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        svr_rbf.fit(train_data, train_labels)
        return svr_rbf
    if algo == 'svr_lin':
        """ SVM regression, linear """
        svr_lin = SVR(kernel='linear')
        svr_lin.fit(train_data, train_labels)
        return svr_lin
    if algo == 'ridge':
        """ Ridge regression """
        clf = Ridge(alpha=0.5)
        clf.fit(train_data, train_labels)
        return clf
    # No algorithm matched
    print "unimplemented model type"
    return None
def regression_weight(self, matched_data):
    converted_data = {}
    for i, data in enumerate(matched_data):
        if i == 0:
            for key in data.keys():
                try:
                    value = float(data[key])
                    converted_data[key] = [value]
                except ValueError:
                    pass
        else:
            for key in data.keys():
                if key in converted_data:
                    converted_data[key].append(float(data[key]))
    sorted_key = sorted(converted_data.keys())
    input_key = [key for key in sorted_key if key != self.main_key.lower()]
    x = []
    for key in input_key:
        # normalization
        numpy_data = normalization(np.array(converted_data[key]))
        x.append(numpy_data)
    x = np.array(x).T
    y = normalization(np.array(converted_data[self.main_key.lower()]))
    regressor = Ridge(alpha=1.0, normalize=True)
    regressor.fit(x, y)
    sorted_result = np.array(input_key)[np.argsort(np.array(regressor.coef_))]
    sorted_result = sorted_result[::-1]
    coefficient = sorted(regressor.coef_, reverse=True)
    return [(sorted_result[i], coefficient[i]) for i in range(len(sorted_result))]
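# normalization is an assumed helper above; a common definition consistent
# with how it is used (rescale one 1-D column of observations) is z-scoring:
def normalization(a):
    return (a - a.mean()) / (a.std() + 1e-12)  # epsilon guards a zero-variance column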
def ridge_regression(train_x, train_y, pred_x, review_id,
                     v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: training features
    :param train_y: training target
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return: the predicted values, learning curve, validation curve
    """
    lin = Ridge(alpha=0.5)
    if get_model:
        print "Fitting Ridge..."
        lin.fit(train_x, np.log(train_y + 1))
        gbr_pred = np.exp(lin.predict(pred_x)) - 1
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_lin = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_ridge.csv", submission_lin, header="Id,Votes",
                   delimiter=',', fmt="%s, %0.2f", comments='')
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(Ridge(), "Validation Curve for Ridge Regression",
                              train_x, np.log(train_y + 1.0),
                              param_name="alpha",
                              param_range=[0.1, 0.2, 0.5, 1, 10])
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(Ridge(), "Learning Curve for Ridge Regression",
                            train_x, np.log(train_y + 1.0))
def ridgeRegression(X, y):
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Ridge Regression")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)
    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ridgeRegression = Ridge(alpha=1e-11, solver="cholesky")
    ridgeRegression.fit(scaled_Xp, y)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    dummyXp = polynomialFeatures.transform(dummyX)  # reuse the fitted transformer
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = ridgeRegression.predict(scaled_dummyXp)
    outputFILE = 'plot-ridgeRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    # savefig takes the file name as its first positional argument (fname)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
def knn_twice(k):
    knn1 = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn1.fit(trainf, trainlab)
    print 'here'
    tim = time.time()
    n = len(train) / 1000
    pred1 = []
    for i in range(0, n):
        pred1.extend(knn1.predict(trainf[(i * 1000):((i + 1) * 1000)]))
        print(i)
    pred1.extend(knn1.predict(trainf[67000:67946]))
    print "time: " + str(time.time() - tim)

    #knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    #knn.fit(pred1, trainlab)
    ridge = Ridge(alpha=1.0)
    ridge.fit(pred1, trainlab)

    n = 10
    pred2 = []
    for i in range(0, n):
        pred2.extend(knn1.predict(testf[(i * 1000):((i + 1) * 1000)].toarray()))
        print(i)
    n = 10
    pred = []
    for i in range(0, n):
        pred.extend(ridge.predict(pred2[(i * 1000):((i + 1) * 1000)]))
        print(i)
    # RMSE:
    testlab = np.array(test.ix[:, 4:])
    err = format(np.sqrt(np.sum(np.array(np.array(pred - testlab) ** 2) /
                                (testf.shape[0] * 24.0))))
    return err
def forecast_future_attention(train_index, test_index, alpha):
    """Forecast future attention via train dataset index and test dataset index."""
    m, n = len(train_index), len(test_index)
    x_train_predict = attention_data[train_index, :num_train]
    x_test_predict = attention_data[test_index, :num_train]
    for i in xrange(num_train, age):
        if with_share == 1:
            x_train = np.hstack((x_train_predict, share_data[train_index, :i + 1]))
            x_test = np.hstack((x_test_predict, share_data[test_index, :i + 1]))
            norm = np.hstack((x_train[:, :i],
                              attention_data[train_index, i].reshape(m, 1),
                              share_data[train_index, :i + 1]))
        else:
            x_train = x_train_predict
            x_test = x_test_predict
            norm = np.hstack((x_train[:, :i],
                              attention_data[train_index, i].reshape(m, 1)))
        x_train_norm = x_train / np.sum(norm, axis=1)[:, None]
        y_train = np.ones(m, )

        # == == == == == == == == Training with Ridge Regression == == == == == == == == #
        predictor = Ridge(fit_intercept=False, alpha=alpha)
        predictor.fit(x_train_norm, y_train)

        # == == == == == == == == Iteratively add forecasted value to x matrix == == == == == == == == #
        predict_train_value = (predictor.predict(x_train) - np.sum(x_train, axis=1)).reshape(m, 1)
        predict_train_value[predict_train_value < 0] = 0
        x_train_predict = np.hstack((x_train_predict, predict_train_value))
        predict_test_value = (predictor.predict(x_test) - np.sum(x_test, axis=1)).reshape(n, 1)
        predict_test_value[predict_test_value < 0] = 0
        x_test_predict = np.hstack((x_test_predict, predict_test_value))
    return x_test_predict[:, num_train:age]
def bowFitAndPrediction(predictData, textSeries, outcome, typeModel='binary'):
    print "Bag of words for %s" % (textSeries.name)
    if typeModel == 'continuous':
        bowModel = Ridge(alpha=0.001)
    else:
        bowModel = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                                      fit_intercept=True, C=1,
                                      intercept_scaling=1, class_weight=None,
                                      random_state=423)
    vectorizer = getFeatures(textSeries)
    X_train = vectorizer.transform(predictData)
    # Outcomes
    Y_train = outcome
    # Logistic regression, not sure if best
    bowModel.fit(X_train, Y_train)
    # Comment out later, fitting on CV data
    if typeModel == 'continuous':
        predict = bowModel.predict(X_train)
        yhat = predict
    else:
        predict = bowModel.predict_proba(X_train)
        yhat = predict[:, 1]
    return (yhat, vectorizer, bowModel)
def kfold_cv(X_train, y_train, idx, k):
    kf = StratifiedKFold(y_train, n_folds=k)
    xx = []
    count = 0
    for train_index, test_index in kf:
        count += 1
        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        y_pred = np.zeros(X_test_cv.shape[0])
        m = 0  # xgboost ensemble disabled: with m == 0 the loop below never runs
        for j in range(m):
            clf = xgb_classifier(eta=0.05, min_child_weight=20, col=0.5,
                                 subsample=0.7, depth=5, num_round=500,
                                 seed=j * 77, gamma=0.1)
            y_pred += clf.train_predict(X_train_cv, y_train_cv, X_test_cv,
                                        y_test=y_test_cv)
            yqq = y_pred * (1.0 / (j + 1))
            print j, llfun(y_test_cv, yqq)
        #y_pred /= m
        clf = Ridge()  # RandomForestClassifier(n_jobs=-1, n_estimators=100, max_depth=100)
        clf.fit(X_train_cv, y_train_cv)
        y_pred = clf.predict(X_test_cv)
        print y_pred.shape
        xx.append(llfun(y_test_cv, y_pred))
        ypred = y_pred
        yreal = y_test_cv
        idx = idx[test_index]
        print xx[-1]  # , y_pred.shape
        break
    print xx, 'average:', np.mean(xx), 'std', np.std(xx)
    return ypred, yreal, idx  # np.mean(xx)
def RidgeRegression(self, filename, outputFile):
    pheno, geno = self.inputParse(filename)
    for row in geno:
        if len(row) % 2 != 0:
            return "Rows are not even."
    maxGeno = max(geno)
    allGeno = list(set(maxGeno))
    encoder = [i for i in range(len(allGeno))]
    lengthGeno = len(geno)
    length = len(geno)
    lenInnerGeno = len(geno[0])
    genoMake = [0 for x in range(len(allGeno))]
    dictionary = dict(zip(allGeno, encoder))
    for i in range(length):
        for x in range(lenInnerGeno):
            geno[i][x] = dictionary[geno[i][x]]
    phenoNaN = []
    for i in range(len(pheno)):
        if pheno[i] == 'NaN':
            phenoNaN.append(i)
    phenoNaN.reverse()
    for i in phenoNaN:
        del pheno[i]
    genoMiss = []
    for i in range(len(geno)):
        if i not in phenoNaN:
            genoMiss.append(geno[i])
    pheno = [float(i) for i in pheno]
    alpha = self.alphaOptimization(genoMiss, pheno)
    clf = Ridge(alpha=alpha)
    clf.fit(genoMiss, pheno)
    predicted = clf.predict(geno)
    predicted = np.transpose(predicted)
    np.savetxt(outputFile, np.transpose(predicted))
def cross_valid(X, Y, n_fold):
    clf = Ridge(alpha=1.0)
    total_mean_square = 0
    total_coef = 0
    Y_np = np.array(Y)
    n_samples, n_features = len(X), len(X[0])
    kf_Y = cross_validation.KFold(n_samples, n_fold)
    index = []
    preds = []
    truths = []
    for train_index, test_index in kf_Y:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y_np[train_index], Y_np[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        index += test_index.tolist()
        preds += map(lambda x: 1 if x > 0.5 else 0, y_pred.tolist())
        truths += y_test.tolist()
        #print "predict:", map(lambda x: 1 if x > 0.5 else 0, y_pred)
        #print "original:", y_test
        total_mean_square += mean_squared_error(y_test, y_pred)
        total_coef += clf.coef_
        #print 'Coefficient of the prediction (pearsonr): ', pearsonr(y_pred, y_test)
    print 'All Coefficient of the prediction (pearsonr): ', pearsonr(truths, preds)
    print 'Average mean squared error is: ', total_mean_square / n_fold
    diff_count = sum([abs(truth - pred) for truth, pred in zip(truths, preds)])
    acc = 100 - 1. * diff_count / len(truths) * 100
    print 'prediction accuracy is %f' % (acc)
    return [total_coef, index, preds]
def fit_strf_ridge(input, output, lags, alpha=1.0, verbose=False):
    # convert the input into a toeplitz-like matrix
    if verbose:
        nt, nf = input.shape
        nelems = nt * nf * len(lags)
        mem = (nelems * 8.) / 1024.**2
        print '[fit_strf_ridge] estimated size of toeplitz matrix: %d MB' % mem
    stime = time.time()
    A = make_toeplitz(input, lags, include_bias=False)
    etime = time.time() - stime
    if verbose:
        print '[fit_strf_ridge] Time to make Toeplitz matrix: %d seconds' % etime

    # fit the STRF
    stime = time.time()
    #rr = Ridge(alpha=alpha, copy_X=False, fit_intercept=True)
    rr = Ridge(alpha=alpha, fit_intercept=True)
    rr.fit(A, output)
    etime = time.time() - stime
    if verbose:
        print '[fit_strf_ridge] Time to fit STRF: %d seconds' % etime

    # reshape the STRF so that it makes sense
    nt = input.shape[0]
    nf = input.shape[1]
    d = len(lags)
    strf = np.array(rr.coef_).reshape([nf, d])
    bias = rr.intercept_

    return strf, bias
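# make_toeplitz is external to this fragment. A minimal sketch of what such a
# helper typically does for STRF fitting (stack time-lagged copies of the
# stimulus so each row holds its history over `lags`); the column layout is an
# assumption chosen to match strf.reshape([nf, d]) above, not the original
# implementation:
import numpy as np

def make_toeplitz(stim, lags, include_bias=False):
    nt, nf = stim.shape
    d = len(lags)
    A = np.zeros((nt, nf, d))
    for k, lag in enumerate(lags):
        shifted = np.roll(stim, lag, axis=0)
        if lag > 0:
            shifted[:lag, :] = 0.0   # zero-pad; np.roll would wrap around
        elif lag < 0:
            shifted[lag:, :] = 0.0
        A[:, :, k] = shifted
    A = A.reshape(nt, nf * d)  # feature-major columns, matching reshape([nf, d])
    if include_bias:
        A = np.hstack([A, np.ones((nt, 1))])
    return A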
def ridge_regressor(df):
    """
    INPUT: Pandas dataframe
    OUTPUT: R^2 and Mean Absolute Error performance metrics, feature coefficients
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)
    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    mae = np.mean(np.absolute(ytest - ypredict))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ", score,
        "MAE is ", mae,  # was an undefined `rmse`; report the MAE computed above
        "MAE percent is ", mae_percent,
        "Feature coefficients are ", zip(feature_names, feat_imps),
    )
def Ridge_model(train_linear, test_linear):
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)

    start = time.time()
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    #ridge.set_params(alpha=6, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))

    y_ridge_predict = ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')

    test_prediction_ridge = np.expm1(ridge.predict(test_linear))
    write_pkl(ridgecv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl')
    return test_prediction_ridge
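# write_pkl is an assumed helper above; a minimal sketch consistent with its
# call site (object first, destination path second):
import pickle

def write_pkl(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)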
class RidgeRegressionModel(LinearLeastSquaresModel):
    def __init__(self, input_columns, output_columns, debug=False):
        self.alpha = 0.0000000001
        self.m = Ridge(alpha=self.alpha)
        super(RidgeRegressionModel, self).__init__(input_columns,
                                                   output_columns,
                                                   debug=debug)

    def fit(self, data):
        A = numpy.vstack([data[:, i] for i in self.input_columns]).T
        B = numpy.vstack([data[:, i] for i in self.output_columns]).T
        self.m.fit(A, B)
        return self.m.coef_  # m.intercept_

    def get_error(self, data, model):
        A = numpy.vstack([data[:, i] for i in self.input_columns]).T
        B = numpy.vstack([data[:, i] for i in self.output_columns]).T
        B_fit = scipy.dot(A, model)
        err_per_point = numpy.sum((B - B_fit)**2, axis=1)  # sum squared error per row
        norm = numpy.sqrt(model * model)
        assert norm.shape == (1, 1)
        regularizer = 1.0 * norm[0, 0]
        return err_per_point - regularizer
def ridge_regression(data, target, alphas):
    plt.figure()
    mean_rmses = []
    kf = KFold(len(target), 10, True, None)
    for alpha0 in alphas:
        rmses = []
        clf = Ridge(alpha=alpha0, normalize=True, solver='svd')
        for train_index, test_index in kf:
            data_train, data_test = data[train_index], data[test_index]
            target_train, target_test = target[train_index], target[test_index]
            clf.fit(data_train, target_train)
            rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
            rmses.append(rmse)
        mean_rmses.append(np.mean(rmses))
        x0 = np.arange(1, 11)
        plt.plot(x0, rmses, label='alpha=' + str(alpha0), marker='o')

    lr = linear_model.LinearRegression(normalize=True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0 = np.arange(1, 11)
    plt.plot(x0, rmses, label='linear', marker='*')

    plt.title("RMSE comparison between different alpha values of Ridge regularization")
    plt.legend()
    plt.show()
    # print(mean_rmses)
    return mean_rmses
def _check_ridge_model(featureses, labels):
    """Plot ridge regression predictions"""
    for tfidf_count in FEATURES_SIZES:
        test_points = []
        for i in range(16):
            tmp = [i, 100]
            tmptmp = [0] * tfidf_count
            if tmptmp:
                tmp.extend(tmptmp)
            test_points.append(tmp)
        test_points = np.array(test_points)
        limit = tfidf_count + 2
        model = Ridge()
        model.fit(featureses[:, :limit], labels)
        predictions = model.predict(test_points)
        plt.plot(
            predictions,
            label=str(tfidf_count),
            linestyle=next(LINECYCLER),
            linewidth=3)
        # plt.text(test_points[-1, 0], predictions[-1], str(tfidf_count))
    plt.legend()
    plt.xlabel('Document order')
    plt.ylabel('Time (seconds)')
    plt.savefig('ridge_predictions.pdf')
def traverse_movies_ridge():
    LBMAP = getLBMap()
    DMAP = createEmpty()
    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []

    for i in range(len(data)):
        movie = data[i]
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)

        if i > 100:
            model = Ridge(alpha=.5)
            model.fit(training_data, training_response)
            raw = math.fabs(model.predict(myvector) - m_rev)
            ERRORS.append(raw)
            #P_ERRORS.append(round(raw / m_rev, 4))

        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)

    #print 'all', avg_float_list(P_ERRORS)
    print 'all', avg_float_list(ERRORS)
def compute_linear_model(mfs, measures):
    from sklearn.linear_model import Ridge
    from sklearn import linear_model

    # try different ones
    clf = Ridge(alpha=1.0)
    #clf = RidgeCV(alphas=[0.1, 1.0, 10.0])
    #clf = linear_model.LinearRegression()

    # explain fexp using BMD + the MFS data
    fexp = measures[:, measures.shape[1] - 1]
    bmd = measures[:, 0]
    bmd = bmd.reshape((bmd.shape[0], 1))

    #print "BMD: ", bmd
    #print "FEXP: ", fexp
    #print "MFS: ", mfs

    # PCA
    #from sklearn.decomposition import PCA
    #pca = PCA(n_components=12)
    #pca.fit(mfs)
    #mfs_pca = pca.transform(mfs)

    X = np.hstack((bmd, mfs))
    clf.fit(X, fexp)

    # Results
    #print "Coefs:", clf.coef_
    print "Score (R^2):", clf.score(X, fexp)
def impute_age():
    X, P = gfa.platform_expression("GPL96")
    model = impute.KNNImputer()
    Xi = model.fit_transform(X, axis=1)
    age = array(P["age"].tolist())
    Xm = Xi.as_matrix()
    ix = array((age >= 10) & (age <= 120)).nonzero()[0]
    np.random.shuffle(ix)
    Xm = Xm[ix, :]
    age = age[ix]

    n_train = 2000
    n_test = 500
    # clf = SVR(C=1e-5, epsilon=1)
    # clf = LinearRegression()
    clf = Ridge()
    # clf = SimpleRegressor()
    # clf = Lasso()
    clf.fit(Xm[:n_train, :], age[:n_train])
    y = age[n_train:(n_train + n_test)]
    y_hat = clf.predict(Xm[n_train:(n_train + n_test)])
    dy = y - y_hat
    bias_tr = y_hat.mean() - age.mean()
    print("\nBias (vs train):\t\t", bias_tr)
    print("Bias (vs test):\t\t\t", dy.mean())
    print("Mean error:\t\t\t", fabs(dy).mean())
    print("Mean error (bias corrected):\t", fabs(dy - bias_tr).mean())
    print("MSE:\t\t\t\t", np.power(dy, 2).mean())
    # (tail of generate_synthetic_data)
    mask[(points[0]).astype(int), (points[1]).astype(int)] = 1  # np.int was removed in recent NumPy
    mask = ndimage.gaussian_filter(mask, sigma=l / n_pts)
    res = np.logical_and(mask > mask.mean(), mask_outer)
    return np.logical_xor(res, ndimage.binary_erosion(res))


# Generate synthetic images, and projections
l = 128
proj_operator = build_projection_operator(l, l // 7)
data = generate_synthetic_data()
proj = proj_operator * data.ravel()[:, np.newaxis]
proj += 0.15 * np.random.randn(*proj.shape)

# Reconstruction with L2 (Ridge) penalization
rgr_ridge = Ridge(alpha=0.2)
rgr_ridge.fit(proj_operator, proj.ravel())
rec_l2 = rgr_ridge.coef_.reshape(l, l)

# Reconstruction with L1 (Lasso) penalization
# the best value of alpha was determined using cross validation
# with LassoCV
rgr_lasso = Lasso(alpha=0.001)
rgr_lasso.fit(proj_operator, proj.ravel())
rec_l1 = rgr_lasso.coef_.reshape(l, l)

plt.figure(figsize=(8, 3.3))
plt.subplot(131)
plt.imshow(data, cmap=plt.cm.gray, interpolation='nearest')
plt.axis('off')
plt.title('original image')
plt.subplot(132)
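# The comment above notes that alpha for the Lasso reconstruction was picked
# with LassoCV. A minimal sketch of that selection (the alpha grid and cv
# value are assumptions, not from the original example):
from sklearn.linear_model import LassoCV

lasso_cv = LassoCV(alphas=np.logspace(-5, -1, 20), cv=3)
lasso_cv.fit(proj_operator, proj.ravel())
print('best alpha: %g' % lasso_cv.alpha_)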
house_lasso_reg = Lasso(alpha=1.0, max_iter=100000, normalize=True, tol=0.0001)
house_lasso_reg = house_lasso_reg.fit(house_train.drop("SalePrice", axis=1),
                                      house_train["SalePrice"])

# Predict using the model
house_lasso_pred = house_lasso_reg.predict(house_test.drop("SalePrice", axis=1))


# In[ ]:


# L2 (Ridge) Regularization
house_ridge_reg = Ridge(alpha=1.0, max_iter=100000, normalize=True,
                        solver='lsqr', tol=0.001)
house_ridge_reg = house_ridge_reg.fit(house_train.drop("SalePrice", axis=1),
                                      house_train["SalePrice"])

# Predict using the model
house_ridge_pred = house_ridge_reg.predict(house_test.drop("SalePrice", axis=1))

#####################
### Cross-Validation Ridge
house_ridge_CV_reg = RidgeCV(alphas=(0.01, 0.1, 1.0, 10.0), normalize=True, cv=10)
house_ridge_CV_reg = house_ridge_CV_reg.fit(
    house_train.drop("SalePrice", axis=1), house_train["SalePrice"])

# Predict using the model
house_ridge_CV_reg_pred = house_ridge_CV_reg.predict(
    house_test.drop("SalePrice", axis=1))
scores_mse_ridge_scikit_train = []
scores_r2_ridge_scikit_train = []
scores_mse_ridge_scikit_val = []
scores_r2_ridge_scikit_val = []
scores_mse_ridge_scikit_test = []
scores_r2_ridge_scikit_test = []

alphas = [1.0, 10.0, 100.0, 1000.0, 10000.0, 100000.0]

for alpha in alphas:
    # Initialize scikit-learn ridge regression model
    model_ridge_scikit = RidgeRegression(alpha=alpha)

    # Trains scikit-learn ridge regression model
    model_ridge_scikit.fit(x_poly_train, y_train)

    print('Results for scikit-learn RidgeRegression model with alpha={}'.format(alpha))

    # Test model on training set
    score_mse_ridge_scikit_train = score_mean_squared_error(
        model_ridge_scikit, x_poly_train, y_train)
    print('Training set mean squared error: {:.4f}'.format(
        score_mse_ridge_scikit_train))

    score_r2_ridge_scikit_train = model_ridge_scikit.score(
        x_poly_train, y_train)
    print('Training set r-squared scores: {:.4f}'.format(
        score_r2_ridge_scikit_train))
if __name__ == "__main__":
    x, y = GetData_x_y('resources/abalone.txt')
    weights = ridgeTest(x, y)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    plt.xlabel('log(lambda)')
    plt.ylabel('regression coeff')
    x_range = [i - 10 for i in range(numTestpts)]
    ax.plot(x_range, weights)
    plt.show()
    print('*********** Fit with sklearn ridge regression ****************')
    clf = Ridge(alpha=.5)
    clf.fit(x, y)
    print(clf.coef_)
    print(clf.intercept_)
    print(clf.predict(
        np.array([1, 0.455, 0.365, 0.095, 0.514, 0.2245, 0.101, 0.15, 1])))
    print("****************** Fit with the hand-written ridge regression code ******************")
    print(weights[15][0:len(weights[0]) - 1])
    print(weights[15][-1])
    print(np.dot(
        weights[15],
        np.array([1, 0.455, 0.365, 0.095, 0.514, 0.2245, 0.101, 0.15, 1]).T))
dict_vect_matrix = dict_vect.fit_transform(lal)
print(dict_vect_matrix, dict_vect_matrix.toarray(), sep='\n\n')

vectorizer_feats = DictVectorizer()
X_train_feats = vectorizer_feats.fit_transform(
    X_train[feats].fillna('-').T.to_dict().values())
X_valid_feats = vectorizer_feats.transform(
    X_valid[feats].fillna('-').T.to_dict().values())
X_test_feats = vectorizer_feats.transform(
    X_test[feats].fillna('-').T.to_dict().values())

X_train_new = scipy.sparse.hstack(
    (X_train_title, X_train_title2, X_train_feats))
X_valid_new = scipy.sparse.hstack(
    (X_valid_title, X_valid_title2, X_valid_feats))
X_test_new = scipy.sparse.hstack((X_test_title, X_test_title2, X_test_feats))

'''model1 = Ridge(alpha=0.1, random_state=1)
model1.fit(X_train_new, y_train)
train_pred1 = model1.predict(X_train_new)
valid_pred1 = model1.predict(X_valid_new)
print(mean_squared_error(y_train, train_pred1),
      mean_squared_error(y_valid, valid_pred1))

model2 = Ridge(alpha=1.0, random_state=1)
model2.fit(X_train_new, y_train)
train_pred2 = model2.predict(X_train_new)
valid_pred2 = model2.predict(X_valid_new)
print(mean_squared_error(y_train, train_pred2),
      mean_squared_error(y_valid, valid_pred2))
'''

model = Ridge(random_state=17)
train_data = scipy.sparse.vstack((X_train_new, X_valid_new))
model.fit(train_data, y)
print(mean_squared_error(y_valid, model.predict(X_valid_new)))
def regression(trainfile, testfile, resultsfile, learner, weightdata=False, writefile=True):
    start_time = time.time()
    X, y = readdata2(False, trainfile)
    X = np.array(X)
    y = np.array(y)
    #print X, y
    bestalpha = -1
    bestscore = -1e200
    bestalphaSE = -1
    bestscoreSE = 1e200
    ncv = 10
    if (len(y) < 20):
        ncv = len(y)
    if (learner == 'Ridge' or learner == 'Lasso'):
        alphalist = np.append(np.logspace(-7, 1, 10), [0])
    elif (learner == 'BayesianRidge'):
        alphalist = [0]
    for alpha in alphalist:
        kf = KFold(len(y), n_folds=ncv)
        nd = 0
        MSE_v = 0.0
        loglkl = 0.0
        loglkl1 = 0.0
        for train_index, test_index in kf:
            X_train, X_v = X[train_index], X[test_index]
            y_train, y_v = y[train_index], y[test_index]
            if weightdata:
                cond = np.abs(X_train[:, 0] - y_train) < 0.004
                print float(np.count_nonzero(cond)) / y_train.shape[0]
            if (learner == 'Ridge'):
                reg1 = Ridge(alpha=alpha)
            elif (learner == 'BayesianRidge'):
                reg1 = BayesianRidge()
            elif (learner == 'Lasso'):
                reg1 = Lasso(alpha=alpha)
            reg1.fit(X_train, y_train)
            predytrain = reg1.predict(X_train)
            predy_v = reg1.predict(X_v)
            MSE_v += np.dot(np.array(predy_v - y_v), np.array(predy_v - y_v))  #/float(len(y_v))
            if (learner != 'BayesianRidge'):
                STD = math.sqrt(np.dot(np.array(y_train - predytrain),
                                       np.array(y_train - predytrain)) /
                                float(len(predytrain) - 1.0)) + 1e-12  # to avoid problems
                loglkl += loglklnormal(y_v, predy_v, STD)
        if (learner != 'BayesianRidge'):
            if (loglkl > bestscore):
                bestscore = loglkl
                bestalpha = alpha
        if (MSE_v < bestscoreSE):
            bestscoreSE = MSE_v
            bestalphaSE = alpha
    #print bestalpha, bestscore
    #print bestalphaSE, bestscoreSE

    # retrain on all the dataset
    if (learner == 'Ridge'):
        reg = Ridge(alpha=bestalpha)
    elif (learner == 'BayesianRidge'):
        reg = BayesianRidge(compute_score=True)
    elif (learner == 'Lasso'):
        reg = Lasso(alpha=bestalpha)
    #reg = Ridge(alpha=bestalpha)
    reg.fit(X, y)
    predy = reg.predict(X)
    vartrain = np.dot(np.array(predy - y), np.array(predy - y)) / float(len(y) - 1.0) + 1e-12
    #print vartrain

    Xtest, ytest = readdata2(False, testfile)
    SEtest = 0.0
    RMSEtest = 0.0
    loglklTest = 0
    if len(ytest) > 0:
        ypred = reg.predict(Xtest)
        SEtest = np.dot(np.array(ypred - ytest), np.array(ypred - ytest))
        print SEtest / float(len(ytest))
        my = sum(ytest) / float(len(ytest))
        vv = [(yy - my) * (yy - my) for yy in ytest]
        # print (1 - reg.score(Xtest, ytest)) * sum(vv)
        RMSEtest = math.sqrt(SEtest / float(len(ytest)))
        loglklTest = loglklnormal(ytest, ypred, math.sqrt(vartrain))
        #print bestscore, loglklTest
    #print reg.alpha_
    #print reg.intercept_, reg.coef_
    #print "HERE"
    if writefile:
        param = open(resultsfile, 'w')
        #print "Writing to: ", resultsfile
        param.write("bias and coefficients,")
        param.write(str(reg.intercept_))
        for c in reg.coef_:
            param.write("," + str(c))
        param.write("\n")
        param.write("STD train,")
        param.write(str(math.sqrt(vartrain)))
        param.write("\n")
        param.write("sum loglikelihood CV train,")
        if (learner == 'BayesianRidge'):
            param.write(str(reg.scores_[-1]))
        else:
            param.write(str(bestscore))
        param.write("\n")
        param.write("sum loglikelihood test,")
        param.write(str(loglklTest))
        param.write("\n")
        param.write("sum squared error CV train,")
        param.write(str(bestscoreSE))
        param.write("\n")
        param.write("sum squared error test,")
        param.write(str(SEtest))
        param.write("\n")
        print RMSEtest
        # param.write("Root MSE (STD) CV train,")
        # param.write(str(math.sqrt(bestscore)))
        # param.write("\n")
        # param.write("squared error sum (score) CV train,")
        # param.write(str(bestscore * len(y)))
        # param.write("\n")
        # param.write("squared error sum test," + str(SEtest))
        # param.write("\n")
        # param.write("Root MSE test," + str(RMSEtest))
        # param.write("\n")
        param.write(str(type(reg)) + ",")
        param.write(str(bestalpha) + ",")
        param.write(str(bestalphaSE))
        param.close()
    print("--- %s seconds ---" % (time.time() - start_time))
    # save model
    #joblib.dump(reg, 'model.pkl')
    #clf = joblib.load('model.pkl')
    return reg
# --------------
from sklearn.linear_model import Lasso

# Code starts here
lasso = Lasso()
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
r2_lasso = lasso.score(X_test, y_test)
print(r2_lasso)


# --------------
from sklearn.linear_model import Ridge

# Code starts here
ridge = Ridge()
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
r2_ridge = ridge.score(X_test, y_test)
print(r2_ridge)
# Code ends here


# --------------
from sklearn.model_selection import cross_val_score

# Code starts here
regressor = LinearRegression()

# Initiate cross-validation score
score = cross_val_score(regressor, X_train, y_train, scoring='r2', cv=10)
print(score)

# calculate mean of the score
IQR = test_Q3 - test_Q1
exist_outlier = [
    str(x[0])
    for x in (temp_y < (test_Q1 - 1.5 * IQR)) | (temp_y > (test_Q3 + 1.5 * IQR))
]
if 'True' in exist_outlier:
    temp_x = temp_x[~((temp_y < (test_Q1 - 1.5 * IQR)) |
                      (temp_y > (test_Q3 + 1.5 * IQR)))].reshape(-1, 1)
    temp_y = temp_y[~((temp_y < (test_Q1 - 1.5 * IQR)) |
                      (temp_y > (test_Q3 + 1.5 * IQR)))].reshape(-1, 1)

# Build a Ridge Regressor
temp_Ridge = Ridge(alpha=k, normalize=True)
temp_Ridge.fit(temp_x, temp_y)
temp_y_pred = temp_Ridge.predict(temp_x)
Ridge_predict = temp_Ridge.predict(Year).reshape(len(Year), ).tolist()
incor_pred = sum(1 for pred in Ridge_predict if pred < 0)
Ridge_vary_alpha[k].append({
    'areaId': areaId,
    'mse': metrics.mean_squared_error(temp_y, temp_y_pred),
    'r2': metrics.r2_score(temp_y, temp_y_pred),
    'num_incor': incor_pred
})

num_incor_alpha = defaultdict(int)
plt.figure(figsize=(8, 4))
plt.subplot(121)
plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)
plt.ylabel("$y$", rotation=0, fontsize=18)
plt.subplot(122)
plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)
save_fig("ridge_regression_plot")
plt.show()
print()

from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
ridge_reg.fit(X, y)
print('ridge_reg.predict([[1.5]]) = {0}'.format(ridge_reg.predict([[1.5]])))
print()

sgd_reg = SGDRegressor(max_iter=50, tol=-np.infty, penalty="l2", random_state=42)
sgd_reg.fit(X, y.ravel())
# report the SGD model's prediction (the original printed ridge_reg again)
print('sgd_reg.predict([[1.5]]) = {0}'.format(sgd_reg.predict([[1.5]])))
print()

from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="sag", random_state=42)
ridge_reg.fit(X, y)
def part2():
    scaler = preprocessing.StandardScaler()
    # Splitting the training and testing data
    train_ratio = 0.3
    num_rows = data.shape[0]
    train_set_size = int(num_rows * train_ratio)
    train_data = data.iloc[:train_set_size]
    test_data = data.iloc[train_set_size:]

    train_features = train_data.drop(['TARGET_D'], axis=1, inplace=False)
    standardize_train_features = scaler.fit_transform(train_features)
    train_features = pd.DataFrame(standardize_train_features)
    train_labels = train_data.loc[:, ['TARGET_D']]

    test_features = test_data.drop(['TARGET_D'], axis=1, inplace=False)
    # use the scaler fitted on the training data; refitting on the test set
    # would leak test statistics into the standardization
    standardize_test_features = scaler.transform(test_features)
    test_features = pd.DataFrame(standardize_test_features)
    test_labels = test_data.loc[:, ['TARGET_D']]

    kFoldsX = train_features
    kFoldsY = train_labels
    maeTrain_list = []
    maeValidation_list = []
    size = int(len(train_features) / 5)
    lambdaValues = range(-3, 11)

    # CV with 5 folds for each lambda value
    for l in lambdaValues:
        start = 0
        maeTrain = 0
        maeValidation = 0
        for k in range(5):
            CVx = kFoldsX[start:(k + 1) * size]
            CVy = kFoldsY[start:(k + 1) * size]
            trainX = pd.concat([kFoldsX[:start], kFoldsX[(k + 1) * size:]])
            trainY = pd.concat([kFoldsY[:start], kFoldsY[(k + 1) * size:]])
            start += size
            trainRidge = Ridge(alpha=(10 ** l))
            trainRidge.fit(trainX, trainY)
            CVPredict = trainRidge.predict(CVx)
            trainPredict = trainRidge.predict(trainX)
            maeTrain += float(np.mean(abs(trainY - trainPredict)))
            maeValidation += float(np.mean(abs(CVy - CVPredict)))
        print("lambda = 10^{}, TRAIN MAE: {}".format(l, maeTrain / 5))
        print("lambda = 10^{}, CV MAE: {}".format(l, maeValidation / 5))
        maeTrain_list.append(maeTrain / 5)
        maeValidation_list.append(maeValidation / 5)

    # plot graph
    plt.plot(lambdaValues, maeTrain_list, label='MAE Training Data')
    plt.plot(lambdaValues, maeValidation_list, label='Validation Data')
    plt.xlabel('Lambda')
    plt.ylabel('MAE')
    plt.legend()
    plt.title('Ridge Regression Graph')
    plt.show()

    print("The best value for lambda is 10^4")
    testRidge = Ridge(alpha=(10 ** 4))
    testRidge.fit(train_features, train_labels)
    testPredict = testRidge.predict(test_features)
    newMAE = float(np.mean(abs(test_labels - testPredict)))
    print("The MAE value with lambda 10^4 is: {}".format(newMAE))
    print("The MAE value is significantly decreased compared to the first part.")
def fit_coeff(ii):
    # Given current cluster of points indexed by vector ii,
    # compute ridge regression/softmax regression problems, one per target
    alphaj = alpha * Nk[j] / N
    if not np.all(categorical):
        # Initialize ridge regressor
        ridge = Ridge(alpha=alphaj, fit_intercept=True, normalize=False)
    # Initialize softmax regressor (for logistic regression)
    h = 0
    for i in range(ny):
        if not categorical[i]:
            ridge.fit(X[ii, :], Yt[ii, i])
            a[j][:, h] = ridge.coef_
            b[j, h] = ridge.intercept_
            h += 1
        else:
            softmax_reg = softmax_regs[j][i]
            softmax_reg.C = 0.5 / alphaj
            tot_elems = cat_values[i]  # categories in entire dataset
            elems = np.unique(Yt[ii, i])  # categories in this cluster (ordered)
            n_elems = len(elems)
            if n_elems < numcat[i]:
                # Possibly missing category values in this cluster still require their
                # corresponding a,b coefficients/intercepts to be optimized.
                # Therefore, we introduce here fake data points whose values
                # equal the missing values (so to maintain coef/intercept order)
                # and with zero weight
                dn = numcat[i] - n_elems
                softmax_weights = np.ones(Nk[j] + dn)
                softmax_weights[0:dn] = 0.0
                fake_values = np.setdiff1d(tot_elems, elems, assume_unique=True)
                softmax_reg.fit(np.vstack((np.zeros((dn, nx)), X[ii, :])),
                                np.vstack((fake_values.reshape(-1, 1),
                                           Yt[ii, i].reshape(Nk[j], 1))).ravel(),
                                sample_weight=softmax_weights)
            else:
                # no category is missing
                softmax_reg.fit(X[ii, :], Yt[ii, i].ravel())
            if numcat[i] == 2:
                # binary target
                # In this case LogisticRegression only returns one coef_ and intercept_ value.
                # LogisticRegression associates +coef_/+intercept_ with the **second** category (True),
                # -coef_/-intercept_ with the **first** category (False). As category numbers are
                # ordered from smallest to largest, the smallest value corresponds to False.
                a[j][:, h] = -softmax_reg.coef_
                b[j, h] = -softmax_reg.intercept_
                h += 1
                a[j][:, h] = softmax_reg.coef_
                b[j, h] = softmax_reg.intercept_
                h += 1
                ##########
                # DEBUG
                ##########
                # Y_pred = softmax_reg.predict(X[ii, :])
                # from sklearn.metrics import accuracy_score
                # print(accuracy_score(Y[ii, i], Y_pred))
                ##########
            else:
                # multi-category softmax, each category has its own coef_/intercept_
                for t in range(numcat[i]):
                    a[j][:, h] = softmax_reg.coef_[t, :]
                    b[j, h] = softmax_reg.intercept_[t]
                    h += 1  # update coefficient/intercept index
    return
# print(X[:10])
# print(Y[:10])
X['Memory'] = X['Memory'].apply(lambda x: float(str(x)[:-1]))
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3,
                                                    random_state=3)
print len(X_test), len(y_test)

lr = LinearRegression()
lr.fit(X_train, y_train)

# the higher the alpha value, the more restricted the coefficients; with a low
# alpha the coefficients are barely restricted, generalization increases, and
# ridge regression resembles plain linear regression
rr = Ridge(alpha=0.01)
rr.fit(X_train, y_train)

rr100 = Ridge(alpha=100)  # comparison with a high alpha value
rr100.fit(X_train, y_train)

train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
Ridge_train_score = rr.score(X_train, y_train)
Ridge_test_score = rr.score(X_test, y_test)
Ridge_train_score100 = rr100.score(X_train, y_train)
Ridge_test_score100 = rr100.score(X_test, y_test)

print "linear regression train score:", train_score
print "linear regression test score:", test_score
print "ridge regression train score low alpha:", Ridge_train_score
print "ridge regression test score low alpha:", Ridge_test_score
print "ridge regression train score high alpha:", Ridge_train_score100
print "ridge regression test score high alpha:", Ridge_test_score100

# plt.plot(rr.coef_, alpha=0.7, linestyle='none', marker='*', markersize=5,
#          color='red', label=r'Ridge; $\alpha = 0.01$', zorder=7)  # zorder for ordering the markers
plt.show()

plt.scatter(X[:, 2], y[:, 0])
plt.title("y vs x_3: no relationship")
plt.show()

######################################
# Sklearn approach
from sklearn.linear_model import Ridge

start = time.time()

# Training
clf = Ridge(alpha=1.0)
log_train = clf.fit(X, y)
b = clf.coef_[0]
print("%.2f sec." % (time.time() - start), end=' - ')
print("Coefficients for x_1, x_2, x_3 are %.3f, %.3f, %.3f, respectively" %
      (b[0], b[1], b[2]))

# Testing
y_hat = clf.predict(X)
print("R square: %.3f" % r2_score(y, y_hat))  # R^2 (coefficient of determination) regression score function.

######################################
# xgboost approach
if (len(y) < 20):
    ncv = len(y)
for alpha in [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]:
    # reg2 = Ridge(alpha=alpha)
    kf = KFold(len(y), n_folds=ncv)
    nd = 0
    MSE_v = 0.0
    loglkl = 0.0
    loglkl1 = 0.0
    for train_index, test_index in kf:
        X_train, X_v = X[train_index], X[test_index]
        y_train, y_v = y[train_index], y[test_index]
        reg1 = Ridge(alpha=alpha)
        reg1.fit(X_train, y_train)
        predytrain = reg1.predict(X_train)
        STD = math.sqrt(np.dot(np.array(y_train - predytrain),
                               np.array(y_train - predytrain)) /
                        float(len(predytrain) - 1.0)) + 1e-10  # to avoid problems
        # print STD, len(predytrain)
        predy_v = reg1.predict(X_v)
        MSE_v += np.dot(np.array(predy_v - y_v), np.array(predy_v - y_v))  #/float(len(y_v))
        loglkl += loglklnormal(y_v, predy_v, STD)
        # loglkl1 += -np.dot(np.array(predy_v - y_v), np.array(predy_v - y_v)) / STD**2 / 2.0 \
        #            + len(y_v) / 2.0 * math.log(1 / STD**2) - len(y_v) / 2.0 * math.log(2.0 * math.pi)
        # print loglkl, loglkl1
    # score = -cross_validation.cross_val_score(reg2, X, y, cv=ncv, scoring='mean_squared_error')
    # score = np.average(score)
    # print alpha, MSE_v, loglkl, STD
    # print score
    if (loglkl > bestscore):
        bestscore = loglkl
        bestalpha = alpha
def fit_regression_model(reisende):
    X, y = preprocess(reisende)
    model = Ridge(alpha=10, fit_intercept=False)
    model.fit(X, y)
    return model
def myliner():
    '''
    Predict house prices directly with linear regression
    :return: None
    '''
    # Fetch the data
    lb = load_boston()

    # Split the dataset into training and test sets
    x_train, x_test, y_train, y_test = train_test_split(lb.data, lb.target, test_size=0.25)
    print(y_train, y_test)

    # Standardize (features and targets both need standardizing,
    # so instantiate two scaler instances)
    std_x = StandardScaler()
    x_train = std_x.fit_transform(x_train)
    x_test = std_x.transform(x_test)

    # Targets
    std_y = StandardScaler()
    y_train = std_y.fit_transform(y_train.reshape(-1, 1))
    y_test = std_y.transform(y_test.reshape(-1, 1))  # the data must be 2-D

    # estimator prediction

    # Normal-equation solution
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    print(lr.coef_)

    # Save the trained model
    #joblib.dump(lr, './test.pkl')
    # Load the model back
    #model = joblib.load('./test.pkl')

    # Predict test-set house prices
    y_lr_predict = std_y.inverse_transform(lr.predict(x_test))
    print("Normal-equation predicted price for each test sample:", y_lr_predict)
    # y_test has to be converted back to its pre-standardization scale
    print("Normal-equation mean squared error:",
          mean_squared_error(std_y.inverse_transform(y_test), y_lr_predict))
    print("*" * 100)

    # Gradient-descent house price prediction
    sgd = SGDRegressor()
    sgd.fit(x_train, y_train)
    print(sgd.coef_)
    y_sgd_predict = std_y.inverse_transform(sgd.predict(x_test))
    print("Gradient-descent predicted price for each test sample:", y_sgd_predict)
    print("Gradient-descent mean squared error:",
          mean_squared_error(std_y.inverse_transform(y_test), y_sgd_predict))

    # Ridge-regression house price prediction
    rd = Ridge(alpha=1.0)
    rd.fit(x_train, y_train)
    print(rd.coef_)
    y_rd_predict = std_y.inverse_transform(rd.predict(x_test))
    print("Ridge predicted price for each test sample:", y_rd_predict)
    print("Ridge mean squared error:",
          mean_squared_error(std_y.inverse_transform(y_test), y_rd_predict))

    return None
# start a training run
root_run = Run.start_logging(workspace=ws, history_name=run_history_name)

# alpha values from 0.0 to 0.95 with a 0.05 interval
alphas = np.arange(0.0, 1.0, 0.05)

print('start sequential parameter sweep...')

# try a bunch of alpha values in a Linear Regression (Ridge) model
for alpha in alphas:
    print('try alpha value of {0:.2f}'.format(alpha))
    # create a bunch of child runs
    with root_run.child_run("alpha" + str(alpha)) as run:
        # More data science stuff
        reg = Ridge(alpha=alpha)
        reg.fit(data["train"]["X"], data["train"]["y"])
        # TODO save model
        preds = reg.predict(data["test"]["X"])
        mse = mean_squared_error(preds, data["test"]["y"])
        # End train and eval

        # log alpha, mean_squared_error and feature names in run history
        run.log("alpha", alpha)
        run.log("mse", mse)
        run.log_list("columns", columns)

        with open(model_file_name, "wb") as file:
            joblib.dump(value=reg, filename=file)

        # upload the serialized model into run history record
        run.upload_file(name="outputs/" + model_file_name,
y = boston.target
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=66,
                                                    shuffle=True, test_size=0.2)

from sklearn.linear_model import LinearRegression, Ridge, Lasso

# models
model1 = LinearRegression()
model2 = Ridge()
model3 = Lasso()

model1.fit(x_train, y_train)
model2.fit(x_train, y_train)
model3.fit(x_train, y_train)

linear_score = model1.score(x_test, y_test)
ridge_score = model2.score(x_test, y_test)
lasso_score = model3.score(x_test, y_test)

# evaluation
print('linear_score: ', linear_score)
print('ridge_score: ', ridge_score)
print('lasso_score: ', lasso_score)

# y_pred = model1.predict(x_test)
# print(y_pred)
# Make a Ridge model and run k-fold validation.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
model = None
for train_ids, valid_ids in cv.split(X_train):
    model = Ridge(
        solver='auto', fit_intercept=True, alpha=0.5,
        max_iter=100, normalize=False, tol=0.05)
    model.fit(X_train[train_ids], y_train[train_ids])
    y_pred_valid = model.predict(X_train[valid_ids])
    rmsle = get_rmsle(y_pred_valid, y_train[valid_ids])
    print(f'valid rmsle: {rmsle:.5f}')
    break

# Predict on test set.
test = pd.read_table(input_folder + 'test.tsv')
X_test = extract_test_features(test, train_vectorizer)
test_ids = test['test_id'].values
y_pred_test = model.predict(X_test[test_ids])
print(y_pred_test)

result = pd.DataFrame(
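# get_rmsle is not defined in this fragment; a standard definition consistent
# with how it is called above (predictions first, targets second) would be:
import numpy as np

def get_rmsle(y_pred, y_true):
    # root mean squared logarithmic error; if the targets were already
    # log1p-transformed upstream, a plain RMSE would be used instead
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))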
def accuracy_on_crimes():
    logger.info("Finding datasets...")
    directory = os.fsencode('input/Crimes_Workload')
    directory_sub = os.fsencode('input/Subqueries/')
    patterns = {'gauss-gauss': '*x-gauss*-length-gauss*',
                'gauss-uni': '*x-gauss*-length-uniform*',
                'uni-gauss': '*x-uniform*-length-gauss*',
                'uni-uni': '*x-uniform*-length-uniform*'}
    train_datasets = {}
    test_datasets = {}
    sub_datasets = {}
    for p in patterns:
        res = [os.fsdecode(n) for n in os.listdir(directory)
               if fnmatch.fnmatch(os.fsdecode(n), patterns[p])]
        train_datasets[p] = res[0] if res[0].startswith('train') else res[1]
        test_datasets[p] = res[0] if res[0].startswith('test') else res[1]
        sub_datasets[p] = [os.fsdecode(n) for n in os.listdir(directory_sub)
                           if fnmatch.fnmatch(os.fsdecode(n), patterns[p])][0]

    res_eval = {'model': [], 'dataset': [], 'aggregate_name': [],
                'kl': [], 'r2': [], 'md': [], 'nrmse': []}

    # Main
    for p in patterns:
        logger.info('Beginning Evaluation for {0}'.format(p))
        logger.info('Loading Datasets...')
        test_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(test_datasets[p]), index_col=0)
        train_df = pd.read_csv('/home/fotis/dev_projects/explanation_framework/input/Crimes_Workload/{0}'.format(train_datasets[p]), index_col=0)
        sub = np.load('/home/fotis/dev_projects/explanation_framework/input/Subqueries/{0}'.format(sub_datasets[p]))
        logger.info('Finished loading\nCommencing Evaluation')
        aggregates = ['count', 'sum_', 'avg']
        agg_map = {'count': 4, 'sum_': 5, 'avg': 6}
        for agg in aggregates:
            logger.info("Evaluating Aggregates : {0}".format(agg))
            X_train = train_df[['x', 'y', 'x_range', 'y_range']].values
            y_train = train_df[agg].values
            sc = StandardScaler()
            sc.fit(X_train)
            X_train = sc.transform(X_train)

            # Training Models
            logger.info("Model Training Initiation\n=====================")
            kmeans = KMeans()
            lr = Ridge()
            lsnr = PR(lr)
            lsnr.fit(X_train, y_train)
            lr_global = LinearRegression()
            lr_global.fit(X_train, y_train)

            logger.info("Accuracy Evaluation on Test set\n=====================")
            for i in range(1000):
                # Obtain query from test set
                dataset = p
                printProgressBar(i, 1000, prefix='Progress:', suffix='Complete', length=50)
                q = test_df.iloc[i].values[:4].reshape(1, -1)
                q = sc.transform(q)
                # Obtain subquery perturbations for query q from test set
                q1 = sub[i]
                X = q1[:, :4]
                y = q1[:, agg_map[agg]]
                X = sc.transform(X)
                # Train local model (should be the best out of the 3)
                lr = LinearRegression()
                lr.fit(X, y)
                y_hat = lr.predict(X)
                metrics_for_model('local', dataset, agg, y_hat, X, y, lr, res_eval)
                # Obtain metrics for ours
                y_hat_s = lsnr.get_model(q).predict(X)
                metrics_for_model('ours', dataset, agg, y_hat_s, X, y, lsnr.get_model(q), res_eval)
                # Obtain metrics for global
                y_hat_g = lr_global.predict(X)
                metrics_for_model('global', dataset, agg, y_hat_g, X, y, lr_global, res_eval)
            logger.info("Finished Queries")
    eval_df = pd.DataFrame(res_eval)
    eval_df.to_csv('output/Accuracy/evaluation_results_linear.csv')
y = file['y']

lamda = [0.1, 1, 10, 100, 1000]
avg = []
for l in range(0, 5):
    clf = Ridge(lamda[l])
    kf = KFold(10)
    RMSE = []  # reset per lambda so each average covers only this lambda's folds
    #i = 1
    # kf.split returns 10 folds, each one of them containing two arrays -
    # one with the indices needed for the training set and one with the indices for the test set
    for train_index, test_index in kf.split(X):  # train_index: array of indices of train data
        #print("Fold", i, ":")
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        #i = i + 1
        RMSE.append(mean_squared_error(y_pred, y_test)**0.5)
    avg.append(np.mean(RMSE))

# output results
d = {'error': avg}
output = pd.DataFrame(d)
output.to_csv('task_1a_output.csv', index=False, header=False)
    n_informative=1,  # The number of informative features, i.e., the number of features used to
                      # build the linear model used to generate the output. (default = 10)
    n_targets=1,      # The number of regression targets, i.e., the dimension of the y output vector
                      # associated with a sample. By default, the output is a scalar. (default = 1)
    noise=5,          # The standard deviation of the gaussian noise applied to the output. (default = 0.0)
    coef=True,        # If True, the coefficients of the underlying linear model are returned. (default = False)
    random_state=1    # Determines random number generation for dataset creation.
)

# With alpha=1 this is nearly identical to linear regression. A high alpha makes bias high.
rr = Ridge(alpha=1)
rr.fit(X, y)
w = rr.coef_[0]

plt.scatter(X, y)
# regression line; will be close to the linear regression line since alpha is only 1
plt.plot(X, X * w, c='red')
plt.show()

# Increasing alpha gives us a less steep slope, which increases bias a little to hopefully
# decrease variance by a lot
rr = Ridge(alpha=10)
rr.fit(X, y)
w = rr.coef_[0]

plt.scatter(X, y)
plt.plot(X, X * w, c='red')
plt.show()
temp = np.delete(temp, np.argwhere(temp[:, 1] <= 0.1632), 0)
X_embedding = temp[:, 0:n_efeatures]
X_train = temp[:, n_efeatures:n_efeatures + n_features]
y_train = temp[:, -1]
print(X_embedding.shape)
print(y_train.shape)
print(X_train.shape)

n_samples = y_train.shape[0]
n_train = int(n_samples * 0.8)
x_t = X_train[0:n_train]
x_e = X_embedding[0:n_train]
y_t_test = y_train[n_train:]
x_t_test = X_train[n_train:]
x_e_test = X_embedding[n_train:]

# note: despite the name, this is a Ridge model (a Lasso variant is commented out below)
lasso_reg = Ridge(alpha=1, solver="cholesky")
# lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(x_t, x_e)
x_e_test_predict = lasso_reg.predict(x_t_test)
print(np.linalg.norm(x_e_test - x_e_test_predict)**2 / x_e_test.shape[0])

plot_embedding(x_e_test, y_t_test.ravel().tolist(), "SLE_for_digital(alpha=0.02)")
plot_embedding(x_e_test_predict, y_t_test.ravel().tolist(), "Ridge_for_SLE(alpha=0.02)")
plt.show()
def create_model(df, y, X, X_train, X_test, y_train, y_test, degree, random_state, test_size, alpha): linreg = LinearRegression() linreg.fit(X_train, y_train) ss = StandardScaler() ss.fit(X_train) X_train_scaled = ss.transform(X_train) X_test_scaled = ss.transform(X_test) linreg_norm = LinearRegression() linreg_norm.fit(X_train_scaled, y_train) X_cat = df[['Month', 'Origin', 'Dest']] X_train_cat, X_test_cat, y_train, y_test = train_test_split( X_cat, y, test_size=test_size, random_state=random_state) # OneHotEncode Categorical variables ohe = OneHotEncoder(handle_unknown='ignore') ohe.fit(X_train_cat) X_train_ohe = ohe.transform(X_train_cat) X_test_ohe = ohe.transform(X_test_cat) columns = ohe.get_feature_names(input_features=X_train_cat.columns) cat_train_df = pd.DataFrame(X_train_ohe.todense(), columns=columns) cat_test_df = pd.DataFrame(X_test_ohe.todense(), columns=columns) X_train_all = pd.concat([pd.DataFrame(X_train_scaled), cat_train_df], axis=1) X_test_all = pd.concat([pd.DataFrame(X_test_scaled), cat_test_df], axis=1) linreg_all = LinearRegression() linreg_all.fit(X_train_all, y_train) print('Baseline model Continuous and Categorical') print('Training r^2:', linreg_all.score(X_train_all, y_train)) print('Testing r^2:', linreg_all.score(X_test_all, y_test)) print('Training MSE:', mean_squared_error(y_train, linreg_all.predict(X_train_all))) print('Testing MSE:', mean_squared_error(y_test, linreg_all.predict(X_test_all))) print("\n") lasso = Lasso(alpha=alpha) #Lasso is also known as the L1 norm. lasso.fit(X_train_all, y_train) print('Lasso') print('Training r^2:', lasso.score(X_train_all, y_train)) print('Testing r^2:', lasso.score(X_test_all, y_test)) print('Training MSE:', mean_squared_error(y_train, lasso.predict(X_train_all))) print('Testing MSE:', mean_squared_error(y_test, lasso.predict(X_test_all))) print("\n") ridge = Ridge(alpha=alpha) #Ridge is also known as the L2 norm. ridge.fit(X_train_all, y_train) print('Ridge') print('Training r^2:', ridge.score(X_train_all, y_train)) print('Testing r^2:', ridge.score(X_test_all, y_test)) print('Training MSE:', mean_squared_error(y_train, ridge.predict(X_train_all))) print('Testing MSE:', mean_squared_error(y_test, ridge.predict(X_test_all))) print("\n") poly_features = PolynomialFeatures(degree) # transforms the existing features to higher degree features. 
    X_train_poly = poly_features.fit_transform(X_train)

    # Fit the transformed features with linear regression
    poly_model = LinearRegression()
    poly_model.fit(X_train_poly, y_train)

    # Predict on the training set
    y_train_predicted = poly_model.predict(X_train_poly)
    # Predict on the test set; use transform only, since the polynomial
    # features were already fit on the training set
    y_test_predict = poly_model.predict(poly_features.transform(X_test))

    # Evaluate the model on the training set
    rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
    r2_train = r2_score(y_train, y_train_predicted)

    # Evaluate the model on the test set
    rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predict))
    r2_test = r2_score(y_test, y_test_predict)

    print("\n")
    print("Polynomial training set")
    print("RMSE of training set is {}".format(rmse_train))
    print("R2 score of training set is {}".format(r2_train))
    print("\n")
    print("Polynomial test set")
    print("RMSE of test set is {}".format(rmse_test))
    print("R2 score of test set is {}".format(r2_test))
    print("\n")

    print('Cross Validation for Polynomial model')
    lm = LinearRegression()
    # Accuracy is only meaningful for classification, so we score with r2;
    # RMSE is not directly available here, so we use (negated) MSE instead
    scores = cross_val_score(lm, X_train_poly, y_train, cv=10, scoring='r2')
    mse_scores = cross_val_score(lm, X_train_poly, y_train, cv=10,
                                 scoring='neg_mean_squared_error')
    print('Cross Validation Mean r2:', np.mean(scores))
    print('Cross Validation Mean MSE:', -np.mean(mse_scores))
    print('Cross Validation 10 Fold Score:', scores)
    print('Cross Validation 10 Fold mean squared error', -(mse_scores))
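# RMSE per fold can be recovered from the negated MSE fold scores computed
# above (a standalone sketch; mse_scores stands in for that array). In newer
# scikit-learn versions, 'neg_root_mean_squared_error' is also available as a
# built-in scoring string.
rmse_scores = np.sqrt(-mse_scores)
print('Cross Validation Mean RMSE:', np.mean(rmse_scores))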
std_x = sc.fit_transform(x)
pca = PCA()
pca_x = pca.fit_transform(std_x)

# Compute percentile rankings for PCA-space features
xp = feature_percentiles(pca_x)

# Grid-search ridge regression to find optimal hyperparameters
gs = GridSearchCV(Ridge(),
                  {'alpha': [0.1, 0.3, 0.6, 1, 3, 6.0, 10, 30, 60, 100]},
                  n_jobs=1, cv=10, scoring='neg_mean_squared_error')
gs.fit(pca_x, logy)

# Store PCA component compositions and feature importance ranks
r = Ridge(**gs.best_estimator_.get_params())
r.fit(pca_x, logy)
feat_imp = np.reshape(r.coef_, (1, len(r.coef_)))
pca_comp = pca.components_
pcfi_US = pd.DataFrame(np.vstack((feat_imp, pca_comp)),
                       columns=df_us.columns[1:-1])

# Store PCA-space features
pc_labels = ['PC#' + str(i) for i in range(1, xp.shape[1] + 1)]
data_US = pd.DataFrame(
    np.hstack((np.reshape(df_us.values[:, 0], (xp.shape[0], 1)), xp)),
    columns=[df_us.columns[0]] + pc_labels)
data_US[geo] = pd.to_numeric(data_US[geo], downcast='integer')

# Write results files to disk
data_US.to_csv('data_' + 'US' + '.csv', index=False, float_format='%.2f')
pcfi_US.to_csv('pcfi_' + '_' + 'US' + '.csv', index=True,
               index_label='PC', float_format='%.5f')
print('National complete.')
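# Note: GridSearchCV refits the best estimator on the full data by default
# (refit=True), so gs.best_estimator_ is already a fitted Ridge model.
# Re-instantiating with its params, as above, is only needed for a fresh fit.
# A minimal equivalent:
r = gs.best_estimator_   # already refit on all of pca_x, logy
print(gs.best_params_)   # e.g. {'alpha': ...}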
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import r2_score
# Assumed import; depending on the fracridge package version the estimator
# may instead be exposed as FracRidgeRegressor
from fracridge import FracRidge

X, y = datasets.load_diabetes(return_X_y=True)

n_alphas = 20
rr_alphas = np.logspace(-10, 10, n_alphas)
rr_coefs = np.zeros((X.shape[-1], n_alphas))
rr_pred = np.zeros((y.shape[-1], n_alphas))
for aa in range(len(rr_alphas)):
    RR = Ridge(alpha=rr_alphas[aa], fit_intercept=True)
    RR.fit(X, y)
    rr_coefs[:, aa] = RR.coef_
    rr_pred[:, aa] = cross_val_predict(RR, X, y)

# Fractional ridge: parameterize regularization by target coefficient fraction
fracs = np.linspace(0, 1, n_alphas)
FR = FracRidge(fracs=fracs, fit_intercept=True)
FR.fit(X, y)
fr_pred = cross_val_predict(FR, X, y)

fig, ax = plt.subplots(1, 2)
ax[0].plot(fracs, FR.coef_.T)
ylims = ax[0].get_ylim()
ax[0].vlines(fracs, ylims[0], ylims[1], linewidth=0.5, color='gray')
ax[0].set_ylim(*ylims)
ax[1].plot(np.log(rr_alphas[::-1]), rr_coefs.T)
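# The two coefficient-path panels are easier to compare with labels; a minimal
# addition, reusing the fig/ax objects created above:
ax[0].set_xlabel('fraction')
ax[0].set_title('FracRidge coefficient paths')
ax[1].set_xlabel('log(alpha), reversed')
ax[1].set_title('Ridge coefficient paths')
plt.show()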
               'pH', 'sulphates', 'alcohol']
pdx = wine_quality[all_colnms]
pdy = wine_quality["quality"]
x_train, x_test, y_train, y_test = train_test_split(pdx, pdy,
                                                    train_size=0.7,
                                                    random_state=42)

alphas = [1e-4, 1e-3, 1e-2, 0.1, 0.5, 1.0, 5.0, 10.0]
initrsq = 0
print("\nRidge Regression: Best Parameters\n")
for alph in alphas:
    ridge_reg = Ridge(alpha=alph)
    ridge_reg.fit(x_train, y_train)
    tr_rsqrd = ridge_reg.score(x_train, y_train)
    ts_rsqrd = ridge_reg.score(x_test, y_test)
    if ts_rsqrd > initrsq:
        print("Lambda: ", alph,
              "Train R-squared value:", round(tr_rsqrd, 5),
              "Test R-squared value:", round(ts_rsqrd, 5))
        initrsq = ts_rsqrd

# Coefficients of ridge regression at the best alpha value
ridge_reg = Ridge(alpha=0.001)
ridge_reg.fit(x_train, y_train)
print("\nRidge Regression coefficient values of Alpha = 0.001\n")
for i in range(11):
    print(all_colnms[i], ": ", ridge_reg.coef_[i])
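# Picking alpha by test-set R^2, as above, leaks test information into model
# selection. A minimal alternative, reusing x_train/y_train from above, selects
# alpha by cross-validation on the training split only:
from sklearn.model_selection import GridSearchCV

gs = GridSearchCV(Ridge(), {'alpha': alphas}, cv=5, scoring='r2')
gs.fit(x_train, y_train)
print("CV-selected alpha:", gs.best_params_['alpha'])
print("Held-out test R^2:", gs.score(x_test, y_test))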
# ls = Lasso()
# ls.fit(X_train, y_train)
# ls_pred = ls.predict(X_test)
# print('Lasso Regression Performance:')
# print('MAE:', metrics.mean_absolute_error(y_test, ls_pred))
# print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, ls_pred)))
# print('R2_Score: ', metrics.r2_score(y_test, ls_pred))
# fig = plt.figure(figsize=(8, 5))
# sns.regplot(x=y_test, y=ls_pred, color='g')
# plt.xlabel('COA')
# plt.ylabel('Predictions')
# plt.title('Lasso Prediction Performance')
# plt.grid()

rg = Ridge()
rg.fit(X_train, y_train)
rg_pred = rg.predict(X_test)
print('Ridge Regression Performance:')
print('MAE:', metrics.mean_absolute_error(y_test, rg_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rg_pred)))
print('R2_Score: ', metrics.r2_score(y_test, rg_pred))

fig = plt.figure(figsize=(8, 5))
sns.regplot(x=y_test, y=rg_pred, color='g')  # keyword args required by newer seaborn
plt.xlabel('COA')
plt.ylabel('Predictions')
plt.title('Ridge Prediction Performance')
plt.grid()

# rf = RandomForestRegressor(n_estimators=100)
# rf.fit(X_train, y_train)
# rf_pred = rf.predict(X_test)
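# The commented-out Lasso and RandomForest blocks repeat the same
# report-and-predict pattern; a small helper (hypothetical, not in the
# original) keeps the evaluations consistent across models:
def report_performance(name, model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print('%s Regression Performance:' % name)
    print('MAE:', metrics.mean_absolute_error(y_test, pred))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, pred)))
    print('R2_Score: ', metrics.r2_score(y_test, pred))
    return pred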
class Regressor():
    """Wraps scikit-learn regressors.

    Parameters
    ----------
    strategy : string, default = "LightGBM" (if installed, else "XGBoost")
        The choice for the regressor.
        Available strategies = "LightGBM" (if installed), "XGBoost",
        "RandomForest", "ExtraTrees", "Tree", "Bagging", "AdaBoost" or "Linear"

    **params : parameters of the corresponding regressor.
        Examples : n_estimators, max_depth...
    """

    def __init__(self, **params):
        if ("strategy" in params):
            self.__strategy = params["strategy"]
        else:
            if (lgbm_installed):
                self.__strategy = "LightGBM"
            else:
                self.__strategy = "XGBoost"

        self.__regress_params = {}
        self.__regressor = None
        self.__set_regressor(self.__strategy)
        self.__col = None
        self.set_params(**params)
        self.__fitOK = False

    def get_params(self, deep=True):
        params = {}
        params["strategy"] = self.__strategy
        params.update(self.__regress_params)
        return params

    def set_params(self, **params):
        self.__fitOK = False

        if 'strategy' in params.keys():
            self.__set_regressor(params['strategy'])
            # Re-apply previously stored parameters to the new regressor
            for k, v in self.__regress_params.items():
                if k not in self.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)

        for k, v in params.items():
            if (k == "strategy"):
                pass
            else:
                if k not in self.__regressor.get_params().keys():
                    warnings.warn("Invalid parameter for regressor " +
                                  str(self.__strategy) +
                                  ". Parameter IGNORED. Check the list of "
                                  "available parameters with "
                                  "`regressor.get_params().keys()`")
                else:
                    setattr(self.__regressor, k, v)
                    self.__regress_params[k] = v

    def __set_regressor(self, strategy):
        self.__strategy = strategy

        if (strategy == 'RandomForest'):
            self.__regressor = RandomForestRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif (strategy == 'XGBoost'):
            self.__regressor = XGBRegressor(
                n_estimators=500, max_depth=6, learning_rate=0.05,
                colsample_bytree=0.8, colsample_bylevel=1., subsample=0.9,
                nthread=-1, seed=0)

        elif (strategy == "LightGBM"):
            if (lgbm_installed):
                self.__regressor = LGBMRegressor(
                    n_estimators=500, learning_rate=0.05,
                    colsample_bytree=0.8, subsample=0.9, nthread=-1, seed=0)
            else:
                warnings.warn("Package lightgbm is not installed. Model "
                              "LightGBM will be replaced by XGBoost")
                self.__strategy = "XGBoost"
                self.__regressor = XGBRegressor(
                    n_estimators=500, max_depth=6, learning_rate=0.05,
                    colsample_bytree=0.8, colsample_bylevel=1., subsample=0.9,
                    nthread=-1, seed=0)

        elif (strategy == 'ExtraTrees'):
            self.__regressor = ExtraTreesRegressor(
                n_estimators=400, max_depth=10, max_features='sqrt',
                bootstrap=True, n_jobs=-1, random_state=0)

        elif (strategy == 'Tree'):
            self.__regressor = DecisionTreeRegressor(
                criterion='mse', splitter='best', max_depth=None,
                min_samples_split=2, min_samples_leaf=1,
                min_weight_fraction_leaf=0.0, max_features=None,
                random_state=0, max_leaf_nodes=None, presort=False)

        elif (strategy == "Bagging"):
            self.__regressor = BaggingRegressor(
                base_estimator=None, n_estimators=500, max_samples=.9,
                max_features=.85, bootstrap=False, bootstrap_features=False,
                n_jobs=-1, random_state=0)

        elif (strategy == "AdaBoost"):
            self.__regressor = AdaBoostRegressor(
                base_estimator=None, n_estimators=400, learning_rate=.05,
                random_state=0)

        elif (strategy == "Linear"):
            self.__regressor = Ridge(
                alpha=1.0, fit_intercept=True, normalize=False, copy_X=True,
                max_iter=None, tol=0.001, solver='auto', random_state=0)

        else:
            raise ValueError(
                "Strategy invalid. Please choose between 'LightGBM' "
                "(if installed), 'XGBoost', 'RandomForest', 'ExtraTrees', "
                "'Tree', 'Bagging', 'AdaBoost' or 'Linear'")

    def fit(self, df_train, y_train):
        """Fits Regressor.

        Parameters
        ----------
        df_train : pandas dataframe of shape = (n_train, n_features)
            The train dataset with numerical features.

        y_train : pandas series of shape = (n_train, )
            The target for regression tasks.

        Returns
        -------
        self
        """
        # sanity checks
        if ((type(df_train) != pd.SparseDataFrame) and
                (type(df_train) != pd.DataFrame)):
            raise ValueError("df_train must be a DataFrame")

        if (type(y_train) != pd.core.series.Series):
            raise ValueError("y_train must be a Series")

        self.__regressor.fit(df_train.values, y_train)
        self.__col = df_train.columns
        self.__fitOK = True

        return self

    def feature_importances(self):
        """Computes feature importances. Regressor must be fitted before.

        Parameters
        ----------
        None

        Returns
        -------
        importance : dict
            Dictionary containing a measure of feature importance (value)
            for each feature (key).
        """
        if self.__fitOK:

            if (self.get_params()["strategy"] in ["Linear"]):

                importance = {}
                f = np.abs(self.get_estimator().coef_)
                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in
                  ["LightGBM", "XGBoost", "RandomForest", "ExtraTrees",
                   "Tree"]):

                importance = {}
                f = self.get_estimator().feature_importances_
                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["AdaBoost"]):

                importance = {}
                norm = self.get_estimator().estimator_weights_.sum()

                try:
                    # XGB, RF, ET, Tree and AdaBoost
                    # TODO: Refactor this part
                    f = sum(weight * est.feature_importances_
                            for weight, est in zip(
                                self.get_estimator().estimator_weights_,
                                self.get_estimator().estimators_)) / norm  # noqa
                except Exception:
                    f = sum(weight * np.abs(est.coef_)
                            for weight, est in zip(
                                self.get_estimator().estimator_weights_,
                                self.get_estimator().estimators_)) / norm  # noqa

                for i, col in enumerate(self.__col):
                    importance[col] = f[i]

            elif (self.get_params()["strategy"] in ["Bagging"]):

                importance = {}
                importance_bag = []

                for i, b in enumerate(self.get_estimator().estimators_):
                    d = {}
                    try:
                        # XGB, RF, ET, Tree and AdaBoost
                        f = b.feature_importances_
                    except Exception:
                        f = np.abs(b.coef_)  # Linear

                    estimator = self.get_estimator()
                    items = enumerate(estimator.estimators_features_[i])
                    for j, c in items:
                        d[self.__col[c]] = f[j]

                    importance_bag.append(d.copy())

                for i, col in enumerate(self.__col):
                    # np.mean over a filter object fails on Python 3, so
                    # materialize the non-zero values in a list first
                    vals = [k[col] if col in k else 0 for k in importance_bag]
                    nonzero = [v for v in vals if v != 0]
                    importance[col] = np.mean(nonzero) if nonzero else 0.

            else:

                importance = {}

            return importance

        else:

            raise ValueError("You must call the fit function before !")

    def predict(self, df):
        """Predicts the target.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        y : array of shape = (n, )
            The target to be predicted.
        """
        try:
            if not callable(getattr(self.__regressor, "predict")):
                raise ValueError("predict attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) and
                    (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.predict(df.values)

        else:

            raise ValueError("You must call the fit function before !")

    def transform(self, df):
        """Transforms df.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        Returns
        -------
        df_transform : pandas dataframe of shape = (n, n_selected_features)
            The transformed dataset with its most important features.
        """
        try:
            if not callable(getattr(self.__regressor, "transform")):
                raise ValueError("transform attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) and
                    (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            return self.__regressor.transform(df.values)

        else:

            raise ValueError("You must call the fit function before !")

    def score(self, df, y, sample_weight=None):
        """Returns the coefficient of determination R^2 of the prediction.

        Parameters
        ----------
        df : pandas dataframe of shape = (n, n_features)
            The dataset with numerical features.

        y : pandas series of shape = (n,)
            The numerical encoded target for classification tasks.

        Returns
        -------
        score : float
            R^2 of self.predict(df) wrt. y.
        """
        try:
            if not callable(getattr(self.__regressor, "score")):
                raise ValueError("score attribute is not callable")
        except Exception as e:
            raise e

        if self.__fitOK:

            # sanity checks
            if ((type(df) != pd.SparseDataFrame) and
                    (type(df) != pd.DataFrame)):
                raise ValueError("df must be a DataFrame")

            if (type(y) != pd.core.series.Series):
                raise ValueError("y must be a Series")

            return self.__regressor.score(df.values, y, sample_weight)

        else:

            raise ValueError("You must call the fit function before !")

    def get_estimator(self):
        return copy(self.__regressor)
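# A minimal usage sketch for the ridge-backed "Linear" strategy; df_train and
# y_train are hypothetical stand-ins for a numerical DataFrame and a Series:
reg = Regressor(strategy="Linear", alpha=10.0)
reg.fit(df_train, y_train)
print(reg.feature_importances())  # |coef_| per column for the Linear strategy
preds = reg.predict(df_train)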
################################################## RIDGE REGRESSION

# PARAMETER TUNING
features = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8']
msk = np.random.rand(len(tf)) < 0.8
train = tf[msk].reset_index(drop=True)
test = tf[~msk].reset_index(drop=True)

row_list = []
for n in range(0, 1001):  # alpha = 0 (plain least squares) through 1000
    clf = Ridge(alpha=n)
    clf.fit(train[features], train.nrtg)
    score = clf.score(test[features], test.nrtg)
    row_list.append({'alpha': n, 'score': score})

alpha_df = pd.DataFrame(row_list)
alpha = alpha_df[alpha_df.score == alpha_df.score.max()].alpha.values[0]

# RIDGE REGRESSION
clf = Ridge(alpha=alpha)
clf.fit(tf[features], tf.nrtg)
coefficients = clf.coef_
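# Sweeping 1001 integer alphas against a single random holdout is slow and
# noisy; RidgeCV scores a grid by cross-validation in one fit. A minimal
# sketch, reusing tf and features from above:
from sklearn.linear_model import RidgeCV

ridge_cv = RidgeCV(alphas=np.logspace(-2, 3, 51))
ridge_cv.fit(tf[features], tf.nrtg)
print("selected alpha:", ridge_cv.alpha_)
coefficients = ridge_cv.coef_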