def add_overall_trend_feature(df_shop, target='pays_count'):
    """Add expanding-window linear-trend features to ``df_shop`` in place.

    Walking backwards over biweeks, fits Ridge(target ~ days_from_beginning)
    on all rows with biweek_id >= m and writes the out-of-sample prediction
    and slope into the rows of biweek m-1, under the columns
    'trend_overall' / 'trend_overall_coeff'.

    NOTE(review): assumes df_shop has 'biweek_id' and 'days_from_beginning'
    columns and that biweek_id is an integer — confirm with callers.
    """
    biweek_max = df_shop.biweek_id.max()
    trend_name = 'trend_overall'
    coeff_name = 'trend_overall_coeff'
    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan
    for m in range(biweek_max - 1, 0, -1):
        # train on every biweek from m onwards; predict for biweek m-1
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)
        df_train = df_shop[train_idx]
        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() <= 7:
            # too few observations to fit a meaningful trend; leave NaNs
            continue
        x = df_train.days_from_beginning
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)
        if m == biweek_max - 1:
            # first (largest) window: also fill in-sample values for the train rows
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]
        df_test = df_shop[test_idx]
        x = df_test.days_from_beginning.values.reshape(-1, 1)
        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]
def add_weekly_overall_trends(df_shop, regressor, trend_name, coeff_name, target='pays_count'):
    """Add expanding-window trend features for an arbitrary regressor column, in place.

    Same scheme as add_overall_trend_feature, but the predictor is the
    negated ``regressor`` column instead of days_from_beginning, and the
    output column names are caller-supplied.

    NOTE(review): the sign flip (-df_shop[regressor]) presumably makes the
    regressor increase with time — confirm against how the column is built.
    """
    biweek_max = df_shop.biweek_id.max()
    df_shop[trend_name] = np.nan
    df_shop[coeff_name] = np.nan
    for m in range(biweek_max - 1, 0, -1):
        # train on every biweek from m onwards; predict for biweek m-1
        train_idx = df_shop.biweek_id >= m
        test_idx = df_shop.biweek_id == (m - 1)
        df_train = df_shop[train_idx]
        y = df_train[target]
        not_null = ~y.isnull()
        if not_null.sum() < 7:
            # too few observations to fit a meaningful trend; leave NaNs
            continue
        x = -df_train[regressor]
        x_not_null = x[not_null].values.reshape(-1, 1)
        y = y[not_null].values
        lr = Ridge(alpha=1).fit(x_not_null, y)
        if m == biweek_max - 1:
            # first (largest) window: also fill in-sample values for the train rows
            x = x.values.reshape(-1, 1)
            df_shop.loc[train_idx, trend_name] = lr.predict(x)
            df_shop.loc[train_idx, coeff_name] = lr.coef_[0]
        df_test = df_shop[test_idx]
        x = -df_test[regressor].values.reshape(-1, 1)
        df_shop.loc[test_idx, trend_name] = lr.predict(x)
        df_shop.loc[test_idx, coeff_name] = lr.coef_[0]
def add_window_trend_overall_features(df_shop, target='pays_count'):
    """Add rolling-window linear-trend features to ``df_shop`` in place.

    For each window length in {2,3,4,5,6,12,18} biweeks, slides the window
    backwards over time, fits Ridge(target ~ days_from_beginning) on the
    window [m_past, m], and writes prediction/slope into biweek m_past-1
    under 'trend_<w>' / 'trend_coef_<w>'.
    """
    biweek_max = df_shop.biweek_id.max()
    for biweeks_past in [2, 3, 4, 5, 6, 12, 18]:
        trend_name = 'trend_%d' % biweeks_past
        trend_coef_name = 'trend_coef_%d' % biweeks_past
        df_shop[trend_name] = np.nan
        df_shop[trend_coef_name] = np.nan
        for m in range(biweek_max, biweeks_past, -1):
            m_past = m - biweeks_past
            # window of `biweeks_past` biweeks ending at m; predict for m_past-1
            train_idx = (df_shop.biweek_id >= m_past) & (df_shop.biweek_id <= m)
            test_idx = df_shop.biweek_id == (m_past - 1)
            df_rolling_train = df_shop[train_idx]
            df_rolling_test = df_shop[test_idx]
            y = df_rolling_train[target]
            not_null = ~y.isnull()
            if not_null.sum() <= 7:
                # too few observations in the window; leave NaNs
                continue
            x = df_rolling_train.days_from_beginning
            x_not_null = x[not_null].values.reshape(-1, 1)
            y = y[not_null].values
            lr = Ridge(alpha=1).fit(x_not_null, y)
            if m == biweek_max:
                # newest window: also fill in-sample values for the train rows
                x = x.values.reshape(-1, 1)
                df_shop.loc[train_idx, trend_name] = lr.predict(x)
                df_shop.loc[train_idx, trend_coef_name] = lr.coef_[0]
            x_val = df_rolling_test.days_from_beginning.values.reshape(-1, 1)
            df_shop.loc[test_idx, trend_name] = lr.predict(x_val)
            df_shop.loc[test_idx, trend_coef_name] = lr.coef_[0]
def Ridge_model(train_linear, test_linear):
    """Tune alpha with RidgeCV, refit a normalized Ridge, plot predicted vs
    actual sale price, and return expm1-transformed test predictions.

    NOTE(review): this function reads several module-level globals instead of
    its parameters (train_linear_fea, train_linear_tar, x_train, y_train,
    x_test, y_test, real_train_tar) — confirm they exist at call time.
    """
    # choose alpha by cross-validation over a log-spaced grid
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(train_linear_fea, train_linear_tar)
    ridgecv_score = ridgecv.score(train_linear_fea, train_linear_tar)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    start = time.time()
    # refit a plain Ridge at the selected alpha and time the fit
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_ridge_predict = ridge.predict(train_linear_fea)
    # diagonal reference line = perfect prediction
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')  # fixed typo: was 'Predict Sle Price'
    # model was trained on log1p targets, so invert with expm1
    test_prediction_ridge = np.expm1(ridge.predict(test_linear))
    write_pkl(ridgecv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/ridge_params.pkl')
    return test_prediction_ridge
class OrderScorer(Scorer):
    """Ridge-regression scorer for the quality of an ordering of sentences,
    with pickle-based caching of the trained model."""

    def __init__(self):
        self.classifier = Ridge(alpha=0.1)
        self.cache_filename = 'subgraph_order_scorer_reg.pickle'

    def train(self, train_instances, train_labels, update_cache=True, sample_weight=None):
        """
        Trains a scorer to score the quality of an ordering of sentences
        Loads from cache if available
        """
        self.classifier.fit(train_instances, train_labels, sample_weight=sample_weight)
        if update_cache:
            # context manager guarantees the handle is closed even if dump fails
            # (original leaked the file object from a bare open())
            with open(self.cache_filename, 'wb') as cache_file:
                pickle.dump(self.classifier, cache_file)

    def test(self, test_instances, test_labels):
        """
        Uses test set to evaluate the performance of the scorer and print it out
        """
        scores = self.classifier.predict(test_instances)
        # TODO: print report

    def load(self):
        """Restore a previously cached classifier; raise if none exists."""
        if os.path.exists(self.cache_filename):
            with open(self.cache_filename, 'rb') as cache_file:
                self.classifier = pickle.load(cache_file)
        else:
            raise Exception("No classifier exists! Must call train with update_cache=True")

    def evaluate(self, test_instance):
        """
        Applies the scoring function to a given test instance
        """
        return self.classifier.predict([test_instance])[0]
def forecast_future_attention(train_index, test_index, alpha):
    """Forecast future attention via train dataset index and test dataset index.

    Iteratively, for each age step i in [num_train, age): fit a no-intercept
    Ridge on row-normalized attention (optionally concatenated with share
    data), derive the next attention increment as predicted-total minus
    current-total (clipped at 0), and append it to both train and test
    matrices. Returns the forecasted columns for the test rows.

    NOTE(review): relies on module-level globals attention_data, share_data,
    num_train, age, with_share — confirm they are defined before calling.
    """
    m, n = len(train_index), len(test_index)
    # start from the observed history up to num_train
    x_train_predict = attention_data[train_index, :num_train]
    x_test_predict = attention_data[test_index, :num_train]
    for i in xrange(num_train, age):
        if with_share == 1:
            x_train = np.hstack((x_train_predict, share_data[train_index, :i + 1]))
            x_test = np.hstack((x_test_predict, share_data[test_index, :i + 1]))
            # normalizer uses the TRUE attention at step i, not the forecast
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1), share_data[train_index, :i + 1]))
        else:
            x_train = x_train_predict
            x_test = x_test_predict
            norm = np.hstack((x_train[:, :i], attention_data[train_index, i].reshape(m, 1)))
        # row-normalize so the regression target is the constant 1
        x_train_norm = x_train / np.sum(norm, axis=1)[:, None]
        y_train = np.ones(m, )
        # == == == == == == == == Training with Ridge Regression == == == == == == == == #
        predictor = Ridge(fit_intercept=False, alpha=alpha)
        predictor.fit(x_train_norm, y_train)
        # == == == == == == == == Iteratively add forecasted value to x matrix == == == == == == == == #
        # increment = predicted cumulative total minus current sum, floored at 0
        predict_train_value = (predictor.predict(x_train) - np.sum(x_train, axis=1)).reshape(m, 1)
        predict_train_value[predict_train_value < 0] = 0
        x_train_predict = np.hstack((x_train_predict, predict_train_value))
        predict_test_value = (predictor.predict(x_test) - np.sum(x_test, axis=1)).reshape(n, 1)
        predict_test_value[predict_test_value < 0] = 0
        x_test_predict = np.hstack((x_test_predict, predict_test_value))
    return x_test_predict[:, num_train: age]
class TriFiLearn: def __init__(self): print "requesting x training set" # setX = pd.read_csv("http://localhost:8080/csv/dimension/x/version/floor10-test") setX = pd.read_csv("csv/training-set-x-version-floor10-test.csv") labelsX = setX['x'].values featuresX = setX.iloc[:, 1:].values self.modelX = Ridge(normalize=True).fit(featuresX, labelsX) print "requesting y training set" # setY = pd.read_csv("http://localhost:8080/csv/dimension/y/version/floor10-test") setY = pd.read_csv("csv/training-set-y-version-floor10-test.csv") labelsY = setY['y'].values featuresY = setY.iloc[:, 1:].values self.modelY = Ridge(normalize=True).fit(featuresY, labelsY) def predictX(self, features): return self.modelX.predict(features) def predictY(self, features): return self.modelY.predict(features) def trainX(self, label, features): labelsX.append(label) featuresX.append(features) self.modelX = Ridge(normalize=True).fit(featuresX, labelsX) def trainY(self, label, features): labelsY.append(label) featuresY.append(features) self.modelY = Ridge(normalize=True).fit(featuresY, labelsY) def reset(self): self.__init__()
def ridge_regression(data, predictors, alpha, models_to_plot=None, test=None):
    """Fit a normalized Ridge of data['y'] on ``predictors`` for one alpha.

    Returns [train RSS, intercept, *coefficients]; if ``test`` is given the
    test-set RSS is appended. When ``alpha`` is a key of ``models_to_plot``,
    the fit is plotted over data['x'] and saved under ./fig/.
    """
    # avoid the mutable-default-argument pitfall (original used `={}`)
    if models_to_plot is None:
        models_to_plot = {}
    # Fit the model
    ridgereg = Ridge(alpha=alpha, normalize=True)
    ridgereg.fit(data[predictors], data['y'])
    y_pred = ridgereg.predict(data[predictors])
    n = len(data['x'])
    # Check if a plot is to be made for the entered alpha
    if alpha in models_to_plot:
        plt.figure()
        plt.plot(data['x'], data['y'], '.', markersize=15)
        plt.plot(data['x'], y_pred, color='red', linewidth=2)
        # NOTE(review): `dpi` is a module-level global — confirm it is defined
        plt.savefig('./fig/ridge_alpha' + models_to_plot[alpha] + '.png', dpi=dpi)
    # Return the result in pre-defined format: [rss, intercept, coef..., (test_rss)]
    rss = (1.0 / n) * sum((y_pred - data['y']) ** 2)
    ret = [rss]
    ret.extend([ridgereg.intercept_])
    ret.extend(ridgereg.coef_)
    if test is not None:
        nt = len(test['x'])
        y_test = ridgereg.predict(test[predictors])
        test_rss = (1.0 / nt) * sum((y_test - test['y']) ** 2)
        ret.extend([test_rss])
    return ret
def reg_lin():
    """Ridge fit of log-target on the global feature matrix; prints the
    holdout log-score and writes validation predictions (back on the
    original scale via exp) to results/validate.txt.

    NOTE(review): relies on module-level globals X, Y and helpers skcv,
    load_data, transformFeatures, logscore — confirm they are defined.
    """
    Xtrain, Xtest, Ytrain, Ytest = skcv.train_test_split(X, Y, train_size=.8)
    regressor = Ridge(alpha=1)
    # train in log-space; predictions are exponentiated back before scoring
    regressor.fit(Xtrain, np.log(Ytrain))
    Ypred = np.array(regressor.predict(Xtest), dtype=float)
    print logscore(Ytest, np.exp(Ypred))
    validate = load_data('validate')
    validate = transformFeatures(validate)
    np.savetxt('results/validate.txt', np.exp(np.array(regressor.predict(validate), dtype=np.dtype('d'))))
class ThreeRidgeEstimator(BaseEstimator):
    '''
    Three Ridge estimator for each class of variable
    '''

    def __init__(self, alpha1=1.0, alpha2=1.0, alpha3=1.0):
        '''
        Initializes a new instance of this estimator
        alpha1: alpha parameter for the first ridge
        alpha2: alpha parameter for the second ridge
        alpha3: alpha parameter for the third ridge
        '''
        self.alpha1 = alpha1
        self.alpha2 = alpha2
        self.alpha3 = alpha3
        self.models = []

    def fit(self, X, Y):
        # one independent ridge per target group: columns [0,5), [5,9), [9,end)
        self.model1 = Ridge(alpha=self.alpha1).fit(X, Y[:, 0:5])
        self.model2 = Ridge(alpha=self.alpha2).fit(X, Y[:, 5:9])
        self.model3 = Ridge(alpha=self.alpha3).fit(X, Y[:, 9:])

    def predict(self, X):
        # raw group predictions
        s_hat = self.model1.predict(X)
        w_hat = self.model2.predict(X)
        k_hat = self.model3.predict(X)
        # renormalize the first two groups so each row sums to 1
        s_hat /= s_hat.sum(axis=1)[:, np.newaxis]
        w_hat /= w_hat.sum(axis=1)[:, np.newaxis]
        return np.hstack((s_hat, w_hat, k_hat))
class MyRegression(object):
    """Compare OLS, Bayesian ridge and ridge fits on the same (x, y) data,
    printing each model's parameters and accuracy (via the module-level
    get_accuracy helper)."""

    def __init__(self, x_data, y_data):
        self.x_data = np.array(x_data)
        self.y_data = np.array(y_data)
        # first feature column, used as the "old" baseline prediction
        self.v_data = [x[0] for x in x_data]
        print "old accuracy", get_accuracy(self.v_data, self.y_data)

    def ols_linear_reg(self):
        """Ordinary least squares; returns the in-sample predictions as ints."""
        self.lr = LinearRegression()
        self.lr.fit(self.x_data, self.y_data)
        adjusted_result = self.lr.predict(self.x_data)
        print "lr params", self.lr.coef_, self.lr.intercept_
        print "lr accuracy", get_accuracy(adjusted_result, self.y_data)
        return map(int, list(adjusted_result))

    def bayes_ridge_reg(self):
        """Bayesian ridge regression; returns the in-sample predictions as ints."""
        br = BayesianRidge()
        br.fit(self.x_data, self.y_data)
        adjusted_result = br.predict(self.x_data)
        print "bayes ridge params", br.coef_, br.intercept_
        print "bayes ridge accuracy", get_accuracy(adjusted_result, self.y_data)
        return map(int, list(adjusted_result))

    def linear_ridge_reg(self):
        """L2-regularized linear regression; returns the in-sample predictions as ints."""
        self.rr = Ridge()
        self.rr.fit(self.x_data, self.y_data)
        adjusted_result = self.rr.predict(self.x_data)
        print "ridge params", self.rr.coef_, self.rr.intercept_
        print "ridge accuracy", get_accuracy(adjusted_result, self.y_data)
        return map(int, list(adjusted_result))
def ridgereg(a):
    """Fit Ridge(alpha=a) on the global base training set, report the
    training score, and write predictions for the global X_test to ridge.csv."""
    print("Doing ridge regression")
    model = Ridge(alpha=a)
    model.fit(base_X, base_Y)
    print ("Score = %f" % model.score(base_X, base_Y))
    predictions = model.predict(X_test)
    write_to_file("ridge.csv", predictions)
def test_brr_like_sklearn():
    """Check that BayesianRidgeRegression predicts (nearly) the same values as
    sklearn's Ridge on synthetic linear data with Gaussian noise."""
    n = 10000
    d = 10
    sigma_sqr = 5
    X = np.random.randn(n, d)
    beta_true = np.random.random(d)
    y = np.dot(X, beta_true) + np.sqrt(sigma_sqr) * np.random.randn(n)
    # use integer floor division: `n / 2` is a float on Python 3 and
    # raises when used as a slice index
    half = n // 2
    X_tr = X[:half, :]
    y_tr = y[:half]
    X_ts = X[half:, :]
    # y_ts = y[half:]

    # prediction with my own bayesian ridge
    lambda_reg = 1
    brr = BayesianRidgeRegression(lambda_reg, add_ones=True, normalize_lambda=False)
    brr.fit(X_tr, y_tr)
    y_ts_brr = brr.predict(X_ts)

    # let's compare to scikit-learn's ridge regression
    rr = Ridge(lambda_reg)
    rr.fit(X_tr, y_tr)
    y_ts_rr = rr.predict(X_ts)

    assert np.mean(np.abs(y_ts_brr - y_ts_rr)) < 0.001, \
        "Predictions are different from sklearn's ridge regression."
def training(X, Y, X_test, pca='kpca', regressor='ridge', dim=50):
    """Train one regressor per projected label dimension and predict X_test.

    Projects the label matrix Y to ``dim`` components ('pca' or 'kpca'; with
    'nopca' it delegates to simpleTraining), fits one Ridge/SVR per component,
    then maps the stacked component predictions back to label space.

    Returns (projection model, list of per-component regressors, Y_pred).
    """
    # X and Y are numpy arrays
    print 'Input data and label shape: ', X.shape, Y.shape
    if pca == 'nopca':
        return simpleTraining(X, Y, X_test, regressor)
    model, P = getProjectionMatrixPCA(Y, dim) if pca == 'pca' else getProjectionMatrixKPCA(dim)
    # project labels into the reduced space (projection direction differs per method)
    Y_train = np.dot(Y, P) if pca == 'kpca' else np.dot(Y, P.transpose())
    regressors = []
    for i in range(dim):
        print 'at regressor number: ', i
        reg = Ridge() if regressor == 'ridge' else SVR()
        # i-th projected component is the target for this regressor
        y = [x[i] for x in Y_train]
        reg.fit(X, y)
        regressors.append(reg)
    Z_pred = []
    for reg in regressors:
        Z_pred.append(reg.predict(X_test))
    print 'prediction shapes:', len(Z_pred), len(Z_pred[0])
    Z_pred = np.array(Z_pred)
    # back-project component predictions to the original label space
    Y_pred = np.dot(P, Z_pred).transpose() if pca == 'kpca' else np.dot(Z_pred.transpose(), P)
    return model, regressors, Y_pred
def ridge_regression(data, target, alphas):
    """10-fold CV comparison of Ridge (one curve per alpha) against plain
    linear regression; plots per-fold RMSE curves and returns the list of
    mean RMSEs (one per alpha, plus one for linear regression).

    NOTE(review): uses the old sklearn KFold(n, n_folds, shuffle, seed) API
    and assumes exactly 10 folds for the x-axis (np.arange(1, 11)).
    """
    plt.figure()
    mean_rmses = []
    kf = KFold(len(target), 10, True, None)
    for alpha0 in alphas:
        rmses = []
        clf = Ridge(alpha=alpha0, normalize=True, solver='svd')
        for train_index, test_index in kf:
            data_train, data_test = data[train_index], data[test_index]
            target_train, target_test = target[train_index], target[test_index]
            clf.fit(data_train, target_train)
            rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
            rmses.append(rmse)
        mean_rmses.append(np.mean(rmses))
        # one point per fold for this alpha
        x0 = np.arange(1, 11)
        plt.plot(x0, rmses, label='alpha=' + str(alpha0), marker='o')
    # unregularized baseline for comparison
    lr = linear_model.LinearRegression(normalize=True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0 = np.arange(1, 11)
    plt.plot(x0, rmses, label='linear', marker='*')
    plt.title("RMSE comparison between different alpha values of Ridge regularization")
    plt.legend()
    plt.show()
    # print(mean_rmses)
    return mean_rmses
class LogisticRegressionSeparator(BaseEstimator):
    """Two-stage model: a balanced logistic classifier decides which rows will
    exceed their current value (column 0 of X); a ridge regressor predicts the
    amount only for those rows. All other rows keep their current value."""

    def get_params(self, deep=True):
        return {}

    def fit(self, X, y):
        # lets predict which users will spend anything later:
        # label = 1 when the target exceeds the current value by more than 0.1
        classes = y - X[:, 0]
        classes = np.where(classes > 0.1, 1, 0)
        self.classifier = LogisticRegression(
            class_weight='balanced')
        self.classifier.fit(X, classes)
        results = self.classifier.predict(X)
        results = results == 1
        # regressor trained only on the rows classified as future spenders
        self.estimator = Ridge(alpha=0.05)
        self.estimator.fit(X[results], y[results])

    def predict(self, X):
        # .copy() is essential: X[:, 0] is a view into the caller's matrix,
        # and the original code wrote predictions back through it,
        # silently mutating X
        y = X[:, 0].copy().reshape(X.shape[0])
        labels = (self.classifier.predict(X) == 1)
        y[labels] = self.estimator.predict(X[labels])
        return y
def regression_NumMosquitos(Xtr, ytr, Xte):
    """Ridge regression of mosquito counts; alpha=340 was chosen earlier via
    RidgeCV(alphas=range(200, 401, 10), cv=5). Returns predictions for Xte."""
    from sklearn.linear_model import Ridge, RidgeCV
    fitted_model = Ridge(alpha=340).fit(Xtr, ytr)
    return fitted_model.predict(Xte)
def ridgeRegression(X, y):
    """Fit a degree-40 polynomial ridge model (cholesky solver, near-zero
    alpha) on standardized features, plot the fitted curve over the data on
    x in [0, 2], and save it to plot-ridgeRegression.png. Returns None."""
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Ridge Regression")
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    # expand to polynomial features, then standardize (scaler reused below)
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)
    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    ridgeRegression = Ridge(alpha=1e-11, solver="cholesky")
    ridgeRegression.fit(scaled_Xp, y)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    # dense grid on [0, 2) for drawing the fitted curve; note transform()
    # (not fit_transform) so the training scaling is reused
    dummyX = np.arange(0, 2, 0.01)
    dummyX = dummyX.reshape((dummyX.shape[0], 1))
    dummyXp = polynomialFeatures.fit_transform(dummyX)
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = ridgeRegression.predict(scaled_dummyXp)
    outputFILE = 'plot-ridgeRegression.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(filename=outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)
    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return (None)
def _make_forecast(self, model, name, alpha=None, l1_ratio=None):
    """
    Output: DataFrame
    Train on the holdout set and make predictions for the next week

    The ``name`` prefix (before the first underscore) selects a fresh
    Ridge/Lasso/ElasticNet at the CV-chosen hyperparameters, overriding
    ``model``; otherwise the passed model is fitted as-is. Forecast rows are
    the holdout features shifted forward one week, and the result is written
    to rideshare_app/data/<name>_forecast.csv.
    """
    X_hold = self.hold_set[self.hold_set.columns[1:]]
    # the price column is named differently in the lyft files
    if 'lyft' in self.filename:
        y_hold = self.hold_set['avg_est_price']
    else:
        y_hold = self.hold_set['avg_price_est']
    # rebuild the model at its tuned hyperparameters, keyed by name prefix
    if name.split("_")[0] == "ridgecv":
        model = Ridge(alpha=alpha)
    elif name.split("_")[0] == "lassocv":
        model = Lasso(alpha=alpha)
    elif name.split("_")[0] == "elasticnetcv":
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
    model.fit(X_hold, y_hold)
    self.X_forecast = X_hold.copy()
    # assumes weekofyear is increasing
    self.X_forecast['weekofyear'] = self.X_forecast['weekofyear'].apply(lambda x: x+1)
    self.X_forecast.index = self.X_forecast.index + pd.Timedelta(days=7)
    self.y_forecast = model.predict(self.X_forecast)
    self.y_forecast = pd.DataFrame(self.y_forecast, index=self.X_forecast.index, columns=['y_forecast'])
    self.y_forecast = pd.concat([self.X_forecast, self.y_forecast], axis=1)
    saved_filename = "rideshare_app/data/{}_forecast.csv".format(name)
    self.y_forecast.to_csv(saved_filename)
    print "saved prediction values to {}".format(saved_filename)
def ridge_regression(train_x, train_y, pred_x, review_id, v_curve=False, l_curve=False, get_model=True):
    """
    :param train_x: train
    :param train_y: text
    :param pred_x: test set to predict
    :param review_id: takes in a review id
    :param v_curve: run the model for validation curve
    :param l_curve: run the model for learning curve
    :param get_model: run the model
    :return:the predicted values,learning curve, validation curve
    """
    lin = Ridge(alpha=0.5)
    if get_model:
        print "Fitting Ridge..."
        # train on log1p(votes); invert with exp(...) - 1 for the submission
        lin.fit(train_x, np.log(train_y+1))
        gbr_pred = np.exp(lin.predict(pred_x)) - 1
        # vote counts cannot be negative; clip the inverse transform at 0
        for i in range(len(gbr_pred)):
            if gbr_pred[i] < 0:
                gbr_pred[i] = 0
        Votes = gbr_pred[:, np.newaxis]
        Id = np.array(review_id)[:, np.newaxis]
        submission_lin = np.concatenate((Id, Votes), axis=1)
        np.savetxt("submission_ridge.csv", submission_lin, header="Id,Votes", delimiter=',', fmt="%s, %0.2f", comments='')
    if v_curve:
        print "Working on Validation Curves"
        plot_validation_curve(Ridge(), "Validation Curve for Ridge Regression", train_x, np.log(train_y+1.0),
                              param_name="alpha", param_range=[0.1, 0.2, 0.5, 1, 10])
    if l_curve:
        print "Working on Learning Curves"
        plot_learning_curve(Ridge(), "Learning Curve for Linear Regression", train_x, np.log(train_y+1.0))
def kfold_cv(X_train, y_train, idx, k):
    """Evaluate a Ridge model on the first fold of a k-fold stratified split
    (the loop breaks after one fold) and return (ypred, yreal, idx) for it.

    NOTE(review): the inner xgboost loop is dead code — m == 0, so
    `for j in range(m)` never executes; left as-is.
    """
    kf = StratifiedKFold(y_train, n_folds=k)
    xx = []
    count = 0
    for train_index, test_index in kf:
        count += 1
        X_train_cv, X_test_cv = X_train[train_index, :], X_train[test_index, :]
        gc.collect()
        y_train_cv, y_test_cv = y_train[train_index], y_train[test_index]
        y_pred = np.zeros(X_test_cv.shape[0])
        m = 0
        for j in range(m):
            # dead: m is 0 (kept from an earlier xgboost-averaging experiment)
            clf = xgb_classifier(eta=0.05, min_child_weight=20, col=0.5, subsample=0.7, depth=5, num_round=500, seed=j*77, gamma=0.1)
            y_pred += clf.train_predict(X_train_cv, (y_train_cv), X_test_cv, y_test=(y_test_cv))
            yqq = y_pred*(1.0/(j+1))
            print j, llfun(y_test_cv, yqq)
        #y_pred/=m;
        clf = Ridge()#RandomForestClassifier(n_jobs=-1,n_estimators=100,max_depth=100)
        clf.fit(X_train_cv, (y_train_cv))
        y_pred = clf.predict(X_test_cv)
        print y_pred.shape
        xx.append(llfun(y_test_cv, (y_pred)))
        ypred = y_pred
        yreal = y_test_cv
        idx = idx[test_index]
        print xx[-1]#,y_pred.shape
        # only the first fold is evaluated
        break
    print xx, 'average:', np.mean(xx), 'std', np.std(xx)
    return ypred, yreal, idx#np.mean(xx)
def knn_twice(k):
    """Two-stage model: k-NN predictions on the training set become the input
    features of a ridge model; returns the formatted test RMSE.

    NOTE(review): relies on module-level globals train, trainf, trainlab,
    test, testf, and hard-codes dataset sizes (67000/67946, 1000-row batches,
    24 output columns) — confirm against the data-loading code.
    """
    knn1 = neighbors.KNeighborsRegressor(n_neighbors=k)
    knn1.fit(trainf, trainlab)
    print 'here'
    tim = time.time();
    # predict the training set in 1000-row batches (memory)
    n = len(train)/1000
    pred1 = []
    for i in range(0, n):
        pred1.extend(knn1.predict(trainf[(i*1000):((i+1)*(1000))]))
        print(i)
    # remaining tail rows beyond the last full batch
    pred1.extend(knn1.predict(trainf[67000:67946]))
    print "time: " + str(time.time() - tim)
    #knn = neighbors.KNeighborsRegressor(n_neighbors=k)
    #knn.fit(pred1,trainlab)
    # second stage: ridge on the k-NN outputs
    ridge = Ridge(alpha=1.0)
    ridge.fit(pred1, trainlab)
    n = 10
    pred2 = []
    for i in range(0, n):
        pred2.extend(knn1.predict(testf[(i*1000):((i+1)*(1000))].toarray()))
        print(i)
    n = 10
    pred = []
    for i in range(0, n):
        pred.extend(ridge.predict(pred2[(i*1000):((i+1)*(1000))]))
        print(i)
    #RMSE:
    testlab = np.array(test.ix[:, 4:])
    err = format(np.sqrt(np.sum(np.array(np.array(pred-testlab)**2)/ (testf.shape[0]*24.0))))
    return err
def cross_valid(X, Y, n_fold):
    """K-fold CV of a Ridge model used as a 0/1 classifier (predictions are
    thresholded at 0.5). Prints pearson correlation, average MSE and accuracy;
    returns [summed coefficients, row indices, thresholded predictions]."""
    clf = Ridge(alpha=1.0)
    total_mean_square = 0
    total_coef = 0
    Y_np = np.array(Y)
    n_samples, n_features = len(X), len(X[0])
    kf_Y = cross_validation.KFold(n_samples, n_fold)
    index = []
    preds = []
    truths = []
    for train_index, test_index in kf_Y:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y_np[train_index], Y_np[test_index]
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        index += test_index.tolist()
        # threshold the continuous ridge output into a 0/1 label
        preds += map(lambda x: 1 if x > 0.5 else 0, y_pred.tolist())
        truths += y_test.tolist()
        #print "predict:",map(lambda x: 1 if x > 0.5 else 0,y_pred)
        #print "original:",y_test
        total_mean_square += mean_squared_error(y_test, y_pred)
        total_coef += clf.coef_
        #print 'Coefficient of the prediction (pearsonr): ' , pearsonr(y_pred,y_test)
    print 'All Coefficient of the prediction (pearsonr): ', pearsonr(truths, preds)
    print 'Average mean squared error is: ', total_mean_square / n_fold
    # accuracy: each |truth - pred| is 0 or 1, so this counts mismatches
    diff_count = sum([abs(truth - pred) for truth, pred in zip(truths, preds)])
    acc = 100-1.*diff_count/len(truths)*100
    print 'prediction accuracy is %f' % (acc)
    return [total_coef, index, preds]
def bowFitAndPrediction(predictData, textSeries, outcome, typeModel='binary'):
    """Bag-of-words model over ``textSeries``: vectorize ``predictData``, fit
    Ridge (continuous) or L2 logistic regression (binary) against ``outcome``,
    and return (in-sample predictions, vectorizer, fitted model).

    For the binary model, yhat is the predicted probability of class 1.
    """
    print "Bag of words for %s" % (textSeries.name)
    if typeModel == 'continuous':
        bowModel = Ridge(alpha=0.001)
    else:
        bowModel = LogisticRegression(penalty='l2', dual=False, tol=0.0001, fit_intercept=True,
                                      C=1, intercept_scaling=1, class_weight=None, random_state=423)
    vectorizer = getFeatures(textSeries)
    X_train = vectorizer.transform(predictData)
    #Outcomes
    Y_train = outcome
    #Logistic regression, not sure if best
    bowModel.fit(X_train, Y_train)
    #Comment out later, fitting on CV data
    if typeModel == 'continuous':
        predict = bowModel.predict(X_train)
        yhat = predict
    else:
        # column 1 = probability of the positive class
        predict = bowModel.predict_proba(X_train)
        yhat = predict[:, 1]
    return (yhat, vectorizer, bowModel)
def impute_age():
    """Predict subject age from GPL96 platform expression data with a ridge
    model and print bias / mean-error / MSE diagnostics.

    NOTE(review): relies on module-level globals gfa, impute, array, fabs —
    confirm their imports; `as_matrix()` implies an old pandas version.
    """
    X, P = gfa.platform_expression("GPL96")
    # fill missing expression values before modeling
    model = impute.KNNImputer()
    Xi = model.fit_transform(X, axis=1)
    age = array(P["age"].tolist())
    Xm = Xi.as_matrix()
    # keep plausible ages only, then shuffle for an unbiased train/test split
    ix = array((age >= 10) & (age <= 120)).nonzero()[0]
    np.random.shuffle(ix)
    Xm = Xm[ix, :]
    age = age[ix]
    n_train = 2000
    n_test = 500
    # clf = SVR(C=1e-5, epsilon=1)
    # clf = LinearRegression()
    clf = Ridge()
    # clf = SimpleRegressor()
    # clf = Lasso()
    clf.fit(Xm[:n_train, :], age[:n_train])
    y = age[n_train : (n_train + n_test)]
    y_hat = clf.predict(Xm[n_train : (n_train + n_test)])
    dy = y - y_hat
    bias_tr = y_hat.mean() - age.mean()
    print("\nBias (vs train):\t\t", bias_tr)
    print("Bias (vs test):\t\t\t", dy.mean())
    print("Mean error:\t\t\t", fabs(dy).mean())
    print("Mean error (bias corrected):\t", fabs(dy - bias_tr).mean())
    print("MSE:\t\t\t\t", np.power(dy, 2).mean())
def RidgeRegression(self, filename, outputFile):
    """Integer-encode genotype symbols, fit Ridge on samples with observed
    phenotypes, and write predictions for every genotype row to ``outputFile``.

    Returns the string "Rows are not even." when any genotype row has an odd
    number of entries; otherwise returns None after saving the predictions.
    """
    pheno, geno = self.inputParse(filename)
    for row in geno:
        if len(row) % 2 != 0:
            return "Rows are not even."
    # Build the symbol alphabet from ALL rows. The original used
    # max(geno) — a single row — so any symbol absent from that row raised
    # KeyError below. sorted() also makes the encoding deterministic
    # (set iteration order is not).
    allGeno = sorted(set(sym for row in geno for sym in row))
    dictionary = {sym: code for code, sym in enumerate(allGeno)}
    for i in range(len(geno)):
        for x in range(len(geno[0])):
            geno[i][x] = dictionary[geno[i][x]]
    # indices of samples whose phenotype is missing
    phenoNaN = [i for i in range(len(pheno)) if pheno[i] == 'NaN']
    # delete from the end so earlier indices stay valid
    phenoNaN.reverse()
    for i in phenoNaN:
        del pheno[i]
    # genotype rows for samples with an observed phenotype
    genoMiss = [geno[i] for i in range(len(geno)) if i not in phenoNaN]
    pheno = [float(i) for i in pheno]
    alpha = self.alphaOptimization(genoMiss, pheno)
    clf = Ridge(alpha=alpha)
    clf.fit(genoMiss, pheno)
    # predict for every sample, including those with missing phenotypes
    predicted = clf.predict(geno)
    predicted = np.transpose(predicted)
    np.savetxt(outputFile, np.transpose(predicted))
def ridge_regressor(df):
    """
    INPUT: Pandas dataframe containing a "price" column (popped in place)
    OUTPUT: tuple of labeled performance metrics — R^2, RMSE, MAE percent —
            and the (feature, coefficient) pairs
    """
    y = df.pop("price").values
    X = df.values
    feature_names = df.columns
    xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
    clf = Ridge(alpha=1.0)
    clf.fit(xtrain, ytrain)
    score = clf.score(xtest, ytest)
    feat_imps = clf.coef_
    ypredict = clf.predict(xtest)
    # the original returned an undefined name `rmse` (NameError); compute it
    rmse = np.sqrt(np.mean((ytest - ypredict) ** 2))
    mae_percent = np.mean(np.absolute(ytest - ypredict) / ytest)
    return (
        "R^2 is ", score,
        "RMSE is ", rmse,
        "MAE percent is ", mae_percent,
        "Feature coefficients are ", zip(feature_names, feat_imps),
    )
def _check_ridge_model(featureses, labels):
    """Plot ridge regression predictions

    For each candidate tf-idf feature count, fits Ridge on the first
    (tfidf_count + 2) feature columns and plots predictions on a synthetic
    grid of 16 test points ([i, 100, 0, 0, ...]); saves ridge_predictions.pdf.

    NOTE(review): relies on module-level FEATURES_SIZES and LINECYCLER.
    """
    for tfidf_count in FEATURES_SIZES:
        # synthetic probe points: document order i, constant 100, zero tf-idf
        test_points = []
        for i in range(16):
            tmp = [i, 100]
            tmptmp = [0] * tfidf_count
            if tmptmp:
                tmp.extend(tmptmp)
            test_points.append(tmp)
        test_points = np.array(test_points)
        # first two columns + tfidf_count tf-idf columns
        limit = tfidf_count + 2
        model = Ridge()
        model.fit(featureses[:, :limit], labels)
        predictions = model.predict(test_points)
        plt.plot(
            predictions,
            label=str(tfidf_count),
            linestyle=next(LINECYCLER),
            linewidth=3)
        # plt.text(test_points[-1, 0], predictions[-1], str(tfidf_count))
    plt.legend()
    plt.xlabel('Document order')
    plt.ylabel('Time (seconds)')
    plt.savefig('ridge_predictions.pdf')
def traverse_movies_ridge():
    """Walk the global movie list chronologically, refitting a Ridge revenue
    model on all previously-seen movies and recording the absolute prediction
    error for each new movie (after a 100-movie warmup); prints the average.

    NOTE(review): relies on module-level data, getLBMap, createEmpty,
    vectorizeMovie, update, avg_float_list. Refitting on every iteration is
    O(n * fit) — deliberate for this walk-forward evaluation.
    """
    LBMAP = getLBMap()
    DMAP = createEmpty()
    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []
    for i in range(len(data)):
        movie = data[i]
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)
        if i > 100:
            # evaluate on this movie before adding it to the training pool
            model = Ridge(alpha=.5)
            model.fit(training_data, training_response)
            raw = math.fabs(model.predict(myvector) - m_rev)
            ERRORS.append(raw)
            #P_ERRORS.append(round(raw/m_rev, 4))
        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)
    #print 'all', avg_float_list(P_ERRORS)
    print 'all', avg_float_list(ERRORS)
def reg_skl_ridge(param, data):
    """Fit a normalized Ridge (alpha from param["alpha"]) on the regression
    training split and return (RMSE on the CV split, CV predictions)."""
    X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv = data
    model = Ridge(alpha=param["alpha"], normalize=True)
    model.fit(X_tr, y_reg_tr)
    cv_pred = model.predict(X_cv)
    return getscoreRMSE(y_reg_cv, cv_pred), cv_pred
# NOTE(review): the next three statements appear to be the tail of a
# linear-regression CV loop that starts above this chunk — confirm their
# original indentation against the full file.
predictions = results.predict(X_test)
MAPE_linear.append(1/a * sum(abs((y_test - predictions)/y_test)))
print("Linear regression: " + str(np.mean(MAPE_linear)))
#higher cross validation MAPE even with high R square might be a case of overfitting

#cross validation - ridge regression
MAPE_ridge = []  #mean absolute percentage error
cv = KFold(n_splits=10, shuffle=True)
for train_index, test_index in cv.split(X, y):
    # a = fold size, used to average the absolute percentage errors
    a = len(test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    fit_ridge = Ridge(alpha=30)
    fit_ridge.fit(X_train, y_train)
    predictions = fit_ridge.predict(X_test)
    MAPE_ridge.append(1/a * sum(abs((y_test - predictions)/y_test)))
print("Ridge regression: " + str(np.mean(MAPE_ridge)))

#Cross Validation - spline regression
MAPE_spline = []  #mean absolute percentage error
cv = KFold(n_splits=10, shuffle=True)
for train_index, test_index in cv.split(X, y):
    n = len(test_index)
    # spline features are built per-fold from the raw component columns
    seconds_train, seconds_test = seconds.iloc[train_index], seconds.iloc[test_index]
    followers_train, followers_test = followers.iloc[train_index], followers.iloc[test_index]
    dummies_train, dummies_test = dummies.iloc[train_index], dummies.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    X_train = spline_transform(seconds_train, followers_train, dummies_train)
    results = sm.OLS(y_train, X_train).fit()
# NOTE(review): these plotting calls continue a figure created above this chunk.
ax[1].set_xlabel("target")
ax[1].set_ylabel("predict")
ax[1].scatter(t, th3, c='g', s=3)
plt.savefig("Linear_regression_advance.jpg")

# Tikhanov (quadratic) Regularizer
# closed-form ridge solution: w = (X'X + gamma*I)^-1 X' t
gamma = 0.2
wR = np.linalg.inv(X2.T @ X2 + gamma * np.identity(NumFeatures + 1)) @ X2.T @ t

# sklearn L1 (Lasso) fit for comparison
l1 = Lasso(alpha=0.2)
l1.fit(X, t)
th_lasso = l1.predict(X)
print(' L1 Reg:{:.3f}'.format(error(t, th_lasso)))

# sklearn L2 (Ridge) fit for comparison
l2 = Ridge(alpha=0.2)
l2.fit(X, t)
th_ridge = l2.predict(X)
print(' L2 Reg:{:.3f}'.format(error(t, th_ridge)))

# compare the closed-form Tikhonov weights against sklearn's Ridge coefficients
fig, ax = plt.subplots(nrows=2, ncols=1, figsize=(16, 16))
ax[0].bar(np.arange(len(wR)), wR)  # Tikhanov (quadratic) Regularizer
ax[1].bar(np.arange(len(l2.coef_)), l2.coef_)  # Ridge
ax[0].set_ylim(-900, 900)
ax[1].set_ylim(-900, 900)
ax[0].set_title("Tikhanov (quadratic) Regularizer")
ax[1].set_title("Ridge regularizer")
plt.savefig("compare L2 regularizer.jpg")

# side-by-side weight plots: pseudo-inverse vs Lasso (third panel filled above/below this chunk)
fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 10))
ax[0].bar(np.arange(len(w)), w)  # Pseudo-increase solution to linear regression
ax[1].bar(np.arange(len(l1.coef_)), l1.coef_)  # Lasso
#lin_reg = LinearRegression() #plot_learning_curve(lin_reg, X, y) from sklearn.pipeline import Pipeline polynomial_regression = Pipeline([ ("poly_features", PolynomialFeatures(degree=10, include_bias=False)), ("sgd_reg", LinearRegression()), ]) #plot_learning_curve(polynomial_regression, X, y) # 1.5 Regulation: Ridge(l2) # Ridge (l2) from sklearn.linear_model import Ridge ridge_reg = Ridge(alpha=1, solver="cholesky") # Ridge is square ridge_reg.fit(X, y) print(ridge_reg.predict([[1.5]])) sgd_reg = SGDRegressor(penalty="l2") #"l2" is Ridge regulation sgd_reg.fit(X, y.ravel()) #SGDRegressor's y is one dimension: use .ravel() # Lasso regulation (l1) from sklearn.linear_model import Lasso lasso_reg = Lasso(alpha=0.1) sgd_reg = SGDRegressor(penalty="l1") #"l1" is Lasso; first order lasso_reg.fit(X, y) lasso_reg.predict([[1.5]]) # Elastic Net (l2+l1) from sklearn.linear_model import ElasticNet elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5) elastic_net.fit(X, y) elastic_net.predict([[1.5]])
# In[17]: from sklearn.linear_model import Ridge model = Ridge(alpha=1, normalize=True) model.fit(x_train, y_train) # In[18]: print('Training_score : ', model.score(x_train, y_train)) # In[19]: y_pred = model.predict(x_test) # In[ ]: df_pred_actual = pd.DataFrame({'predicted': y_pred, 'actual': y_test}) # In[ ]: df_pred_actual.head(50) # In[ ]: df_pred_actual.to_csv('Results.csv') # In[22]:
                # NOTE(review): tail of a model.fit(...) call whose opening
                # line is above this chunk — keras-style keyword arguments
                validation_data=(X_test, y_test),
                batch_size=128, epochs=100, verbose=1)
y_preds = model.predict(X_test)
print(r2_score(y_test, y_preds))

# In[50]:
# ridge baseline for comparison against the neural network above
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.linear_model import Ridge
regressor = Ridge(alpha=50, max_iter=10000)
regressor.fit(X_train, y_train)
y_preds = regressor.predict(X_test)
print(r2_score(y_test, y_preds))
print(sqrt(mean_squared_error(y_test, y_preds)))

# In[37]:
# model = build_model(X_train.shape[1])
# model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=16, epochs=100, verbose=1)
#
# y_preds = model.predict(X_test)
# print(r2_score(y_test, y_preds))

# In[38]:
sqrt(mean_squared_error(y_test, y_preds))
def NARMA_Test(test_length=800, train_length=800, num_loops=1, a=0, plot=True, N=400, eta=0.4, gamma=0.05, phi=np.pi / 6, tau=400, bits=8, preload=False): """ Args: test_length: length of testing data train_length: length of training data num_loops: number of delay loops in reservoir a: ridge regression parameter N: number of virtual nodes plot: display calculated time series gamma: input gain eta: oscillation strength phi: phase of MZN r: loop delay length bits: bit precision preload: preload mask and time-series data Returns: NRMSE: Normalized Root Mean Square Error """ #Import u and m if preload: file1 = open("data/Input_sequence.txt", "r") file2 = open("data/mask_2.txt", "r") contents = file1.readlines() contents2 = file2.readlines() u = [] m = [] for i in range(1000): u.append(float(contents[i][0:contents[i].find("\t")])) if i < 400: m.append(float(contents2[i][0:contents2[i].find("\n")])) file1.close() file2.close() u = np.array(u) m = np.array(m) #Randomly initialize u and m else: u = np.random.rand(train_length + test_length) / 2. 
m = np.array( [random.choice([-0.1, 0.1]) for i in range(N // num_loops)]) #Calculate NARMA10 target target = NARMA_Generator(len(u), u) #Instantiate Reservoir, feed in training and verification datasets r1 = DelayReservoir(N=N // num_loops, eta=eta, gamma=gamma, theta=0.2, loops=num_loops, phi=phi) x = r1.calculateMZNBit(u[:train_length], m, bits) #x_ideal = r1.calculateMZN(u[:train_length],m) x_test = r1.calculateMZNBit(u[train_length:], m, bits) #x_test_ideal = r1.calculateMZN(u[train_length:],m) #Train using Ridge Regression #clf = RidgeCV(alphas = a,fit_intercept = True) clf = Ridge(alpha=a) clf.fit(x, target[:train_length]) y_test = clf.predict(x_test) y_input = clf.predict(x) #Calculate NRMSE NRMSE = np.sqrt(np.mean(np.square(y_test[50:]-target[train_length+50:]))/\ np.var(target[train_length+50:])) NRMSEi = np.sqrt(np.mean(np.square(y_input-target[:train_length]))/\ np.var(target[:train_length])) #Write to File ''' x_total = np.concatenate((x,x_test)) x_total = x_total.flatten(order='C') file1 = open("data/64_bit_test_x.txt","w+") file2 = open("data/64_bit_test_y.txt","w+") for i in range(2*320000): file1.write("%f"%x_total[i]+"\n") if(i < 1600): file2.write("%f"%target[i]+"\n") file1.close() ''' #Plot predicted Time Series if (plot == True): #fig, (ax1,ax2) = plt.subplots(2,1) #ax1.plot(x.flatten()[5000:]) #ax2.plot(x_ideal.flatten()[5000:]) #plt.plot(x.flatten()[:1200]) plt.plot(y_test[50:], label='Prediction') plt.plot(target[train_length + 50:], label='Target') plt.title('NRMSE = %f' % NRMSE) plt.legend() plt.show() return NRMSE
# Замените пропуски в столбцах LocationNormalized и ContractTime на специальную строку 'nan'. train['LocationNormalized'].fillna('nan', inplace=True) train['ContractTime'].fillna('nan', inplace=True) # Примените DictVectorizer для получения one-hot-кодирования признаков LocationNormalized и ContractTime. enc = DictVectorizer() X_train_cat = enc.fit_transform(train[['LocationNormalized', 'ContractTime']].to_dict('records')) # Объедините все полученные признаки в одну матрицу "объекты-признаки". Обратите внимание, что матрицы для текстов и # категориальных признаков являются разреженными. Для объединения их столбцов нужно воспользоваться функцией # scipy.sparse.hstack. X_train = hstack([X_train_text, X_train_cat]) # 3. Обучите гребневую регрессию с параметром alpha=1. Целевая переменная записана в столбце SalaryNormalized. y_train = train['SalaryNormalized'] model = Ridge(alpha=1) model.fit(X_train, y_train) # 4. Постройте прогнозы для двух примеров из файла salary-test-mini.csv. Значения полученных прогнозов являются # ответом на задание. Укажите их через пробел. test = pandas.read_csv('../source/salary-test-mini.csv') X_test_text = vec.transform(text_transform(test['FullDescription'])) X_test_cat = enc.transform(test[['LocationNormalized', 'ContractTime']].to_dict('records')) X_test = hstack([X_test_text, X_test_cat]) y_test = model.predict(X_test) print(1, '{:0.2f} {:0.2f}'.format(y_test[0], y_test[1]))
    # Tail of expand(...): raise column 0 to successive powers to build
    # polynomial features (the def line is outside this chunk).
    for power in range(2,degree+1):
        expanded[:,power-1]=data[:,0]**power
    return expanded

data = expand(data, 10)
# NOTE(review): len(data)/2 is a float under Python 3 — confirm random_split
# accepts/truncates it.
train, temp = random_split(data, len(data)/2)
valid, test = random_split(temp, len(temp)/2)
lambs = np.linspace(0.01,0.2)
bestYs = []
result = []
best_err = 10000000 # very large number
# Sweep ridge strength and record validation MSE per lambda.
for lamb in lambs:
    solver = Ridge(alpha = lamb, solver='cholesky',tol=0.00001)
    solver.fit(train[:,:-1],train[:,-1])
    ys = solver.predict(valid[:,:-1])
    valid_err = np.mean((ys-valid[:,-1])**2)
    result.append([lamb, valid_err])
    # BUG(review): best_err is never updated inside the loop, so this keeps
    # the predictions of the *last* lambda beating the initial sentinel, not
    # the best one. Add `best_err = valid_err` inside this branch.
    if valid_err<best_err: # keep the best
        bestYs = ys
result = np.array(result)
plt.plot(result[:,0], result[:,1], 'o')
plt.show()
'''
means = np.mean(Xs, 0)
stdevs = np.std(Xs, 0)
# Assemble one feature row for the day being predicted and score it with the
# trained ridge model. (Loop body: `row`, `index`, `year`, `month`, `day`,
# `ridge`, `prediction_list`, `a`, `b`, `filter` and `data` come from outside
# this chunk.)
day_to_predict = datetime.date(year=year, month=month, day=day)
precipitation_intensity = row["Precipitation Intensity"]
precipitation_probability = row["Precipitation Probability"]
dew_point = row["Dew Point"]
highest_temp = row["Highest Temp"]
lowest_temp = row["Lowest Temp"]
humidity = row['Humidity']
uv_index = row['UV Index']
prediction_values = numpy.array([[year, month, day, precipitation_intensity,
                                  precipitation_probability, dew_point,
                                  highest_temp, lowest_temp, humidity,
                                  uv_index]])
# Fixed: call predict on the fitted estimator instance directly instead of
# the unbound-class form `Ridge.predict(ridge, ...)` (same behavior,
# idiomatic sklearn usage).
prediction = ridge.predict(prediction_values)
write_string = str(prediction).strip("[]")
prediction_list.append(prediction)
#Predictions are outputted with brackets around the number which is a nuisance when
# trying to graph the data in excel so I remove them before writing to the csv
filtered_data.loc[index, "Predicted Generation [kWh]"] = str(prediction).strip("[]")
filtered_data.loc[index, "Date"] = day_to_predict
write_string = write_string.strip("[]")  # no-op: already stripped above

# applies the filter to our graphs to remove the noise
filtered_predictions = filter(a, b, prediction_list, axis=0)
filtered_actual = filter(a, b, data["Generation [kWh]"], axis=0)
filtered_data["Filtered Predictions"] = filtered_predictions
filtered_data["Filtered Actual"] = filtered_actual
    # Collect the feature names kept by the SelectKBest mask (the enclosing
    # def/callback is outside this chunk).
    # NOTE(review): the loop variable `bool` shadows the builtin of the same
    # name within this scope.
    for bool, feature in zip(mask, df.columns[1:].tolist()):
        if bool:
            new_features.append(feature)
    #print(new_features)
    features.value = new_features
    stats.text = "Top 5 features according to Select K Best (Chi2) : " + str(new_features)
    '''
    #print(new_features)
    x_train_original,x_test_original,y_train_original,y_test_original=train_test_split(df1,y,test_size=0.25)
    clf = Ridge()
    clf.fit(x_train_original,y_train_original)
    predictions=clf.predict(x_test_original)
    scores = cross_val_score(clf,df1,y,cv=5,scoring='neg_mean_squared_error')
    stats2.text += "Mean Squared Error: %.2f" % mean_squared_error(y_test_original, predictions) + '</br>'
    stats2.text += " Variance score: %.2f" % r2_score(y_test_original, predictions) + '</br>'
    stats2.text += " Cross Validation score: %.2f " % scores.mean()
    '''

# Empty PR-curve placeholder figure shown in a single bokeh tab.
p1 = figure(plot_height=350,title="PR Curve")
p1.x_range = Range1d(0,1)
p1.y_range = Range1d(0,1)
p1.line([0],[0],name ="line2")
tab1 = Panel(child=p1, title="PR Curve")
tabs = Tabs(tabs=[ tab1 ])
'''
# print(np.shape(coefs)) # ax = plt.gca() # ax.plot(grid, coefs) # ax.set_xscale('log') # plt.axis('tight') # plt.xlabel('alpha') # plt.ylabel('weights') # plt.show() # Split data into 50/50 train/test X_train, X_test , y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=1) # Fit a ridge regression model with lambda 4 ridge4 = Ridge(alpha = 4, normalize = True) ridge4.fit(X_train, y_train) # Fit a ridge regression on the training data pred = ridge4.predict(X_test) # Use this model to predict the test data # print(pd.Series(ridge4.coef_, index = X.columns)) # Print coefficients print("MSE alpha 4: ", round(mean_squared_error(y_test, pred),2)) # Calculate the test MSE # Fit a ridge regression model with lambda 10^10 ridge1010 = Ridge(alpha = 10**10, normalize = True) ridge1010.fit(X_train, y_train) pred = ridge1010.predict(X_test) # print(pd.Series(ridge1010.coef_, index = X.columns)) print("MSE alpha 10^10: ", round(mean_squared_error(y_test, pred),2)) # Fit a ridge regression model with lambda 0 (Which is equivalent to least squares) ridge = Ridge(alpha = 0, normalize = True) ridge.fit(X_train, y_train) pred = ridge.predict(X_test) # print(pd.Series(ridge.coef_, index = X.columns))
class RidgeClass:
    """
    Name : Ridge
    Attribute : None
    Method : predict, predict_by_cv, save_model

    Loads the sample regression CSV, splits it by year (<=2017 train,
    >=2018 test), builds 7-day temperature windows and fits a Ridge model.
    """

    def __init__(self):
        # Algorithm name (used in saved-model file names).
        self._name = 'ridge'
        # Base path: parent directory of this file.
        self._f_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
        # Suppress warning messages.
        warnings.filterwarnings('ignore')
        # Load the raw data.
        data = pd.read_csv(self._f_path + "/regression/resource/regression_sample.csv", sep=",", encoding="utf-8")
        # Boolean masks for the train/test split by year.
        self._x = (data["year"] <= 2017)
        self._y = (data["year"] >= 2018)
        # Build training windows and labels.
        self._x_train, self._y_train = self.preprocessing(data[self._x])
        # Build test windows and labels.
        self._x_test, self._y_test = self.preprocessing(data[self._y])
        # Declare the model.
        self._model = Ridge(alpha=0.5)
        # Fit the model.
        self._model.fit(self._x_train, self._y_train)

    # Preprocessing: sliding 7-day window of temperatures -> next-day label.
    def preprocessing(self, data):
        # feature rows
        x = []
        # labels
        y = []
        # window length (7 days)
        base_interval = 7
        # temperature series
        temps = list(data["temperature"])
        for i in range(len(temps)):
            # Skip until a full window is available.
            if i < base_interval:
                continue
            y.append(temps[i])
            xa = []
            for p in range(base_interval):
                d = i + p - base_interval
                xa.append(temps[d])
            x.append(xa)
        return x, y

    # Plain prediction on the held-out split.
    def predict(self, save_img=False, show_chart=False):
        # Predict
        y_pred = self._model.predict(self._x_test)
        # R^2 score
        score = r2_score(self._y_test, y_pred)
        # Report coefficients when the model exposes them.
        if hasattr(self._model, 'coef_') and hasattr(self._model, 'intercept_'):
            print(f'Coef = {self._model.coef_}')
            print(f'intercept = {self._model.intercept_}')
        print(f'Score = {score}')
        # Optionally save a chart image.
        if save_img:
            self.save_chart_image(y_pred, show_chart)
        # Predictions & score
        return [list(y_pred), score]

    # Cross-validation prediction (intentionally not implemented).
    def predict_by_cv(self):
        # For regression, implement CV as appropriate for the real project.
        return False

    # GridSearchCV prediction (not implemented).
    def predict_by_gs(self):
        pass

    # Save or refresh the persisted model.
    def save_model(self, renew=False):
        # Save the model.
        if not renew:
            # First save.
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')
        else:
            # Replace the existing model, renaming the old file with a timestamp.
            if os.path.isfile(self._f_path + f'/model/{self._name}_rg.pkl'):
                os.rename(self._f_path + f'/model/{self._name}_rg.pkl',
                          self._f_path + f'/model/{str(self._name) + str(time.time())}_rg.pkl')
            joblib.dump(self._model, self._f_path + f'/model/{self._name}_rg.pkl')

    # Save the regression chart (actual in red, predicted in blue).
    def save_chart_image(self, data, show_chart):
        # Figure size.
        plt.figure(figsize=(15, 10), dpi=100)
        # Ground-truth labels.
        plt.plot(self._y_test, c='r')
        # Predicted values.
        plt.plot(data, c='b')
        # Save as an image.
        plt.savefig('./chart_images/tenki-kion-lr.png')
        # Optionally display the chart.
        if show_chart:
            plt.show()

    def __del__(self):
        del self._x_train, self._x_test, self._y_train, self._y_test, self._x, self._y, self._model
這兩個迴歸帶有Regularization正規化效果 ElasticNet是LASSO跟Ridge的結合 ''' #Ridge訓練(需手動設定參數) from sklearn.linear_model import Ridge Ridge_regressor = Ridge(alpha=1.0) Ridge_regressor.fit(X_train, Y) #RidgeCV訓練(透過CV挑選參數) from sklearn.linear_model import RidgeCV Ridge_regressor = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1) Ridge_regressor.fit(X_train, Y) #Ridge預測 y_pred = Ridge_regressor.predict(X_test) #ElasticNet訓練(需手動設定參數) from sklearn.linear_model import ElasticNet ElasticNet_regressor = ElasticNet(alpha=1.0, l1_ratio=0.5) ElasticNet_regressor.fit(X_train, Y) #ElasticNetCV訓練(透過CV決定參數) from sklearn.linear_model import ElasticNetCV ElasticNet_regressor = ElasticNet(cv=5) ElasticNet_regressor.fit(X_train, Y) #ElasticNet預測 y_pred = ElasticNet_regressor.predict(X_test)
def NARMA_Test_Compare(test_length=200, train_length=800, num_loops=1, a=0,
                       plot=True, N=400, eta=0.5, gamma=1, phi=np.pi / 4,
                       r=1):
    """
    Compare with pre-determined NARMA10 series

    Args:
        test_length: length of verification data
        train_length: length of training data
        num_loops: number of delay loops in reservoir
        a: list of ridge regression constants for hyperparameter tuning
        N: number of virtual nodes
        plot: display calculated time series
        gamma: input gain
        eta: oscillation strength
        phi: phase of MZN
        r: loop length ratio

    Returns:
        NRMSE: Normalized Root Mean Square Error
    """
    # Import u (input), target, and m (mask) from reference files; the input
    # file stores "u<TAB>target" per line, the mask file one value per line.
    file1 = open("data/uin_and_target.txt", "r")
    file2 = open("data/Mask.txt", "r")
    contents = file1.readlines()
    contents2 = file2.readlines()
    u = []
    target = []
    m = []
    for i in range(1000):
        u.append(float(contents[i][0:contents[i].find("\t")]))
        target.append(float(contents[i][contents[i].find("\t"):]))
        if i < 400:
            m.append(float(contents2[i][0:contents2[i].find("\n")]))
    file1.close()
    file2.close()
    u = np.array(u)
    m = np.array(m)
    target = np.array(target)

    #Instantiate Reservoir, feed in training and verification datasets
    r1 = DelayReservoir(N=N // num_loops, eta=eta, gamma=gamma, theta=0.2,
                        loops=num_loops, phi=phi)
    # NOTE(review): these two reservoir computations are effectively dead —
    # `x` is overwritten by the file-loaded node states just below, and
    # `x_test` is only used for the x_total concatenation at the end.
    x = r1.calculateMZN(u[:train_length], m)
    x_test = r1.calculateMZN(u[train_length:], m)
    # Replace x with pre-computed node states from disk (400000 values
    # reshaped to 1000 timesteps x 400 virtual nodes).
    x = []
    file3 = open("data/X_node.txt", "r")
    contents3 = file3.readlines()
    print(len(contents3))
    for i in range(400000):
        x.append(float(contents3[i][:contents3[i].find("\n")]))
    x = np.array(x)
    x = x.reshape((-1, 1))
    x = x.reshape((1000, 400))

    #Train using Ridge Regression
    clf = Ridge(alpha=a, fit_intercept=True)
    clf.fit(x[:800], target[:train_length])
    w = clf.coef_
    # NOTE(review): y_train = x @ w ignores the fitted intercept even though
    # fit_intercept=True — confirm this is intentional for the comparison.
    y_train = x @ w
    y_test = clf.predict(x[800:])

    #Write to file
    x_total = np.concatenate((x, x_test))
    x_total = x_total.flatten(order='C')
    file3 = open("data/y_train2.txt", "w+")
    for i in range(800):
        file3.write("%f" % y_train[i] + "\n")
    file3.close()

    # Calculate NRMSE over the test segment, skipping 50 washout samples.
    NRMSE = np.sqrt(np.mean(np.square(y_test[50:]-target[train_length+50:]))/\
        np.var(target[train_length+50:]))

    #Plot predicted Time Series
    if (plot == True):
        plt.plot(y_test[50:], label='Prediction')
        plt.plot(target[train_length + 50:], label='Target')
        plt.title('NRMSE = %f' % NRMSE)
        plt.legend()
        plt.show()

    return NRMSE
BINS[i] = NUM * i #Loading data X, Y = Loader.data_load(CLASS, PARTS, PATH) #Using colour histograms if needed X = Loader.histogram(X, BINS, NUM) #Reshape and polynomize if needed if FL: X = Loader.preproc(X, normalize=False, reshape=True)[0] print('DO ', X.shape) X = Loader.polynom(X, GRADE) print('POSLE', X.shape) X, TRAIN_IND, TEST_IND = Loader.preproc(X, reshape=False) #preprocessing and data split if needed if not FL: X, TRAIN_IND, TEST_IND = Loader.preproc(X) print(X.shape) print(X[TRAIN_IND].shape, X[TEST_IND].shape) eval_set = [X[TEST_IND], Y[TEST_IND]] #The Ridge model = Ridge(alpha=ALPH, max_iter=ITER) model.fit(X[TRAIN_IND], Y[TRAIN_IND]) #prediction #X_TEST = X[TEST_IND] Y_P = model.predict(X[TEST_IND]) accuracy = score(Y[TEST_IND], Y_P.round()) print('TEST ACCURACY = ', accuracy * 100, '%')
# Evaluate the linear-regression model fit earlier in the notebook.
y_pred_lr=model_lr.predict(x_test)
get_performance(y_pred_lr)
get_plot(y_pred_lr)
# NOTE(review): duplicate call — performance for y_pred_lr is printed twice.
get_performance(y_pred_lr)

"""# Ridge Regression"""

model_ridge = Ridge()
model_ridge.fit(x_train, y_train)

#generate predictions
y_pred_ridge=model_ridge.predict(x_test)
get_performance(y_pred_ridge)
get_plot(y_pred_ridge)

"""# Gradient Boosting Trees"""

# Model #2 - Gradient Boosting Trees
model_gb = GradientBoostingRegressor()
model_gb.fit(x_train, y_train)

# Infer
y_pred_gb = model_gb.predict(x_test)
get_performance(y_pred_gb)
class ExpectedRankRegression(ObjectRanker, Learner):
    def __init__(self, n_object_features, alpha=0.0, l1_ratio=0.5, tol=1e-4,
                 normalize=True, fit_intercept=True, random_state=None,
                 **kwargs):
        """
        Create an expected rank regression model. This model normalizes
        the ranks to [0, 1] and treats them as regression target. For
        α = 0 we employ simple linear regression. For α > 0 the model
        becomes ridge regression (when l1_ratio = 0) or elastic net
        (when l1_ratio > 0).

        Parameters
        ----------
        n_object_features : int
            Number of features of the object space
        alpha : float, optional
            Regularization strength
        l1_ratio : float, optional
            Ratio between pure L2 (=0) or pure L1 (=1) regularization.
        tol : float, optional
            Optimization tolerance
        normalize : bool, optional
            If True, the regressors will be normalized before fitting.
        fit_intercept : bool, optional
            If True, the linear model will also fit an intercept.
        random_state : int, RandomState instance or None, optional
            Seed of the pseudorandom generator or a RandomState instance
        **kwargs
            Keyword arguments for the algorithms

        References
        ----------
        .. [1] Kamishima, T., Kazawa, H., & Akaho, S. (2005, November).
               "Supervised ordering-an empirical survey.", Fifth IEEE
               International Conference on Data Mining.
        """
        self.normalize = normalize
        self.n_object_features = n_object_features
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        self.tol = tol
        self.logger = logging.getLogger('ERR')
        self.fit_intercept = fit_intercept
        self.random_state = check_random_state(random_state)
        # Learned coefficient vector; set by fit().
        self.weights = None

    def fit(self, X, Y, **kwargs):
        """Fit the underlying linear model on the flattened ranking data.

        Chooses LinearRegression (alpha ~ 0), ElasticNet (l1_ratio >= 0.01)
        or Ridge, then stores the flattened coefficients (plus intercept when
        fitted) in self.weights.
        """
        # NOTE(review): the `normalize=` constructor kwarg used below was
        # removed from these estimators in scikit-learn 1.2 — confirm the
        # pinned sklearn version.
        self.logger.debug('Creating the Dataset')
        x_train, y_train = complete_linear_regression_dataset(X, Y)
        assert x_train.shape[1] == self.n_object_features
        self.logger.debug('Finished the Dataset')
        # Treat near-zero alpha as plain (unregularized) linear regression.
        if self.alpha < 1e-3:
            self.model = LinearRegression(normalize=self.normalize,
                                          fit_intercept=self.fit_intercept)
            self.logger.info("LinearRegression")
        else:
            if self.l1_ratio >= 0.01:
                self.model = ElasticNet(alpha=self.alpha,
                                        l1_ratio=self.l1_ratio,
                                        normalize=self.normalize,
                                        tol=self.tol,
                                        fit_intercept=self.fit_intercept,
                                        random_state=self.random_state)
                self.logger.info("Elastic Net")
            else:
                self.model = Ridge(alpha=self.alpha,
                                   normalize=self.normalize,
                                   tol=self.tol,
                                   fit_intercept=self.fit_intercept,
                                   random_state=self.random_state)
                self.logger.info("Ridge")
        self.logger.debug('Finished Creating the model, now fitting started')
        self.model.fit(x_train, y_train)
        self.weights = self.model.coef_.flatten()
        if self.fit_intercept:
            self.weights = np.append(self.weights, self.model.intercept_)
        self.logger.debug('Fitting Complete')

    def _predict_scores_fixed(self, X, **kwargs):
        """Score objects: lower predicted rank -> higher score, normalized
        per instance."""
        n_instances, n_objects, n_features = X.shape
        self.logger.info(
            "For Test instances {} objects {} features {}".format(*X.shape))
        # Flatten instances so the linear model sees one row per object.
        X1 = X.reshape(n_instances * n_objects, n_features)
        scores = n_objects - self.model.predict(X1)
        scores = scores.reshape(n_instances, n_objects)
        scores = normalize(scores)
        self.logger.info("Done predicting scores")
        return scores

    def predict_scores(self, X, **kwargs):
        return super().predict_scores(X, **kwargs)

    def predict_for_scores(self, scores, **kwargs):
        return ObjectRanker.predict_for_scores(self, scores, **kwargs)

    def predict(self, X, **kwargs):
        return super().predict(X, **kwargs)

    def clear_memory(self, **kwargs):
        # No cached state to release for this learner.
        pass

    def set_tunable_parameters(self, alpha=0.0, l1_ratio=0.5, tol=1e-4,
                               **point):
        """Update the tunable hyperparameters; unknown keys are warned about."""
        self.tol = tol
        self.alpha = alpha
        self.l1_ratio = l1_ratio
        if len(point) > 0:
            self.logger.warning('This ranking algorithm does not support'
                                ' tunable parameters'
                                ' called: {}'.format(print_dictionary(point)))
# NOTE(review): 'ANSI' is a Windows-only codec alias — confirm this script
# only runs on Windows, or switch to an explicit codec.
output_file = open('output.txt', 'w', encoding='ANSI')
X_train, y_train = prepare_train_data_set()
X_test = prepare_test_data_set()

# TF-IDF features from the job description text (terms in >=5 documents).
vectorizer = TfidfVectorizer(min_df=5)
X_train_vector = coo_matrix(
    vectorizer.fit_transform(X_train.FullDescription))
X_test_vector = coo_matrix(vectorizer.transform(X_test.FullDescription))

# One-hot categorical features via DictVectorizer (enc defined outside).
X_train_category = coo_matrix(
    enc.fit_transform(X_train[['LocationNormalized',
                               'ContractTime']].to_dict('records')))
X_test_category = coo_matrix(
    enc.transform(X_test[['LocationNormalized',
                          'ContractTime']].to_dict('records')))

# Join sparse text + categorical columns.
X_train_stack = hstack([X_train_vector, X_train_category])
X_test_stack = hstack([X_test_vector, X_test_category])

clf = Ridge(alpha=1)
clf.fit(X_train_stack, y_train)
# Unpacking assumes the test set contains exactly two rows.
a, b = clf.predict(X_test_stack)
print(round(a, 2), round(b, 2), sep=' ', file=output_file)
output_file.close()
    def model_selection(self, train):
        """
        Performs a test/train split on the training data. Gridsearches over
        three regularixation models (Lasso, Ridge, and ElasticNet), and fits
        a final model using the best performing model (Ridge) from the
        gridsearch stage. Returns the validation MSE and RMSE of the final
        model.

        Args:
            train: cleaned and scaled training data
        Returns:
            Validation MSE and RMSE of best performing gridsearched model.
        """
        # Test/Train split training data
        y = train['SalePrice']
        X = train.drop('SalePrice', axis=1)
        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # Gridsearch Lasso Model
        lasso = Lasso()
        param_list = {'alpha': np.linspace(.1, 1, 10)}
        lasso_grid = GridSearchCV(lasso, param_list,
                                  scoring='neg_mean_squared_error', cv=5)
        lasso_grid.fit(X_train, y_train)
        print('Model: {}, Best Params: {}, Best Score: {}'\
            .format(lasso, lasso_grid.best_params_, abs(lasso_grid.best_score_)))

        # Gridsearch Ridge Model
        ridge = Ridge()
        param_list = {
            'alpha': np.linspace(.1, 1, 10),
            'solver': ['auto', 'svd', 'lsqr', 'cholesky']
        }
        ridge_grid = GridSearchCV(ridge, param_list,
                                  scoring='neg_mean_squared_error', cv=5)
        ridge_grid.fit(X_train, y_train)
        print('Model: {}, Best Params: {}, Best Score: {}'\
            .format(ridge, ridge_grid.best_params_, abs(ridge_grid.best_score_)))

        # Gridsearch ElasticNet Model
        elastic = ElasticNet()
        param_list = {
            'alpha': np.linspace(0.5, 0.9, 20),
            'l1_ratio': np.linspace(0.9, 1.0, 10)
        }
        elastic_grid = GridSearchCV(elastic, param_list,
                                    scoring='neg_mean_squared_error', cv=5)
        elastic_grid.fit(X_train, y_train)
        print('Model: {}, Best Params: {}, Best Score: {}'\
            .format(elastic, elastic_grid.best_params_, abs(elastic_grid.best_score_)))

        # Best model on validation set of training data.
        # NOTE(review): the final alpha/solver are hard-coded rather than
        # taken from ridge_grid.best_params_ — confirm this is intentional.
        final_ridge = Ridge(alpha=1.0, solver='svd')
        final_ridge.fit(X_train, y_train)
        y_pred = final_ridge.predict(X_test)
        # RMSLE via log1p-style difference of predictions and truth.
        log_diff = np.log(y_pred + 1) - np.log(y_test + 1)
        score = np.sqrt(np.mean(log_diff**2))
        print('Validation MSE Score: {}'.format(
            mean_squared_error(y_test, y_pred)))
        print('Validation RMSLE Score: {}'.format(score))
#spliting the dataset for training and testing
# Fixed: `sklearn.cross_validation` was removed in scikit-learn 0.20;
# train_test_split lives in sklearn.model_selection (same signature).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

#importing ridge model
from sklearn.linear_model import Ridge
ridge = Ridge()
ridge.fit(X_train, y_train)

# In[12]:
pred_test = ridge.predict(X_test)
pred_test

# In[13]:
# R^2 on the held-out split.
ridge.score(X_test, y_test)

# In[14]:
#MSE
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred_test)

# In[16]:
    # K-fold loop body: fit Ridge per fold, fill out-of-fold predictions and
    # accumulate averaged test predictions (loop header is outside this chunk).
    print("fold n°{}".format(fold_ + 1))
    trn_data, trn_y = train.iloc[trn_idx][features], target.iloc[
        trn_idx].values
    val_data, val_y = train.iloc[val_idx][features], target.iloc[
        val_idx].values

    # Impute missing values with the per-split column means.
    trn_data.fillna((trn_data.mean()), inplace=True)
    val_data.fillna((val_data.mean()), inplace=True)

    trn_data = trn_data.values
    val_data = val_data.values

    clf = Ridge(alpha=100)
    clf.fit(trn_data, trn_y)

    oof_ridge[val_idx] = clf.predict(val_data)
    # Average the test-set predictions across folds.
    predictions_ridge += clf.predict(tst_data) / folds.n_splits

np.save('oof_ridge', oof_ridge)
np.save('predictions_ridge', predictions_ridge)
# CV RMSE of the out-of-fold predictions.
np.sqrt(mean_squared_error(target.values, oof_ridge))

# In[ ]:
del tst_data
gc.collect()

# 3.78 CV is not bad, but it's far from what the best models can do in this
# competition. Let's take a look at a few non-linear models. We'll start with
# LightGBM.

# In[ ]:
# Tail of a DataFrame.drop(...) call whose opening line is outside this chunk
# (removes the target and identifying columns from the feature matrix).
    'Total', 'Precipitation', 'Date', 'Day', 'Brooklyn Bridge',
    'Manhattan Bridge', 'Queensboro Bridge', 'Williamsburg Bridge'
])
y_train = df_new['Total']

#%%
from sklearn import preprocessing
from sklearn.linear_model import Ridge

reg = Ridge(alpha=100)
reg.fit(x_train, y_train)

#%%
reg.coef_

#%%
# NOTE(review): metrics below are computed on the *training* data, so they
# measure fit, not generalization.
from sklearn.metrics import r2_score, mean_squared_error
y_pred = reg.predict(x_train)
print(r2_score(y_train, y_pred))
print(mean_squared_error(y_train, y_pred))

#%%
import yellowbrick
res = y_train - y_pred

#%%
from yellowbrick.regressor import ResidualsPlot
visualizer = ResidualsPlot(reg)
visualizer.score(x_train, y_train)  # Evaluate the model on the test data
visualizer.poof()  # Draw/show/poof the data
# Toy 1-D example: fit ridge regression on a perfectly linear series and
# extrapolate three points.
x_train = np.array([[0], [1], [2], [3], [4], [5]])
y_train = np.array([0, 1, 2, 3, 4, 5])
x_test = np.array([[6], [7], [8]])
y_test = np.array([6, 7, 8])

# Fixed: dropped the explicit `normalize=False` kwarg — it was removed from
# Ridge in scikit-learn 1.2 and False was the default anyway, so behavior is
# unchanged while the script now runs on current versions. The remaining
# arguments are spelled out for didactic purposes (all library defaults).
reg = Ridge(alpha=1.0,
            fit_intercept=True,
            copy_X=True,
            max_iter=None,
            tol=0.001,
            solver='auto',
            random_state=None)
reg.fit(x_train, y_train)
y_predict = reg.predict(x_test)
print('Ridge_score: ', reg.score(x_train, y_train))
print('Ridge_coef_: ', reg.coef_)
print('Ridge_intercept_: ', reg.intercept_)

# Plot outputs: test points (black) vs. the ridge fit (blue line).
plt.scatter(x_test, y_test, color='black')
plt.plot(x_test, y_predict, color='blue', linewidth=3)
plt.xticks(())
plt.yticks(())
plt.show()
# Ridge regression demo: degree-6 polynomial features over the first four
# columns of a CSV-style text file, fitting column 5 as the target.
import numpy as np
from sklearn.linear_model import Ridge
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures

data = np.genfromtxt('Data/RidgeTestData01.txt', delimiter=',', skip_header=0)
plt.plot(data[:, 4])
# plt.show()

# data[:, :4] selects the first four columns (features) of every row.
X = data[:, :4]
y = data[:, 4]
poly = PolynomialFeatures(6)
X = poly.fit_transform(X)
train_set_X, test_set_X, train_set_y, test_set_y = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=0)
clf = Ridge(alpha=1.0, fit_intercept=True)
clf.fit(train_set_X, train_set_y)
clf.score(test_set_X, test_set_y)

start = 1  # plot a short window of the fitted curve
end = 18
y_pre = clf.predict(X)  # fitted values from predict()
time = np.arange(start, end)
# Real data in blue, fitted curve in red.
plt.plot(time, y[start:end], 'b', label="real")
plt.plot(time, y_pre[start:end], 'r', label='predict')
# Fixed: legend location was misspelled 'upper lef', which raises a
# ValueError on current matplotlib.
plt.legend(loc='upper left')
plt.show()
# Fit five regressors on the same split and keep per-model predictions.
lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
dtr = DecisionTreeRegressor()
rfr = RandomForestRegressor(n_estimators=50)

lr.fit(X_train, y_train)
lasso.fit(X_train, y_train)
ridge.fit(X_train, y_train)
dtr.fit(X_train, y_train)
rfr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_dtr = dtr.predict(X_test)
y_pred_rfr = rfr.predict(X_test)

# Score the Lasso model on the full dataframe and compute per-row absolute
# percentage error against the actual 7-14-over runs.
pred = lasso.predict(df[[
    'day', 'month', 'year', 'wickets in 1 to 6 1st innings',
    'venue average runs in 1st innings',
    'venue average wickets in 1st innings'
]])
act = df[['runs in 7 to 14 overs 1st innings']]
dif = []
for i in range(len(pred)):
    k = ((abs(pred[i] - act['runs in 7 to 14 overs 1st innings'].iloc[i])) /
         act['runs in 7 to 14 overs 1st innings'].iloc[i])
    dif.append(k * 100)
#Code starts here regressor = LinearRegression() score = cross_val_score(regressor, X_train, y_train, cv=10) mean_score = np.mean(score) print(mean_score) # -------------- from sklearn.linear_model import Lasso # Code starts here lasso = Lasso(random_state=0) lasso.fit(X_train, y_train) y_pred = lasso.predict(X_test) r2_lasso = r2_score(y_test, y_pred) # -------------- from sklearn.linear_model import Ridge # Code starts here ridge = Ridge(random_state=0) ridge.fit(X_train, y_train) y_pred = ridge.predict(X_test) r2_ridge = r2_score(y_test, y_pred)
dfExp = scaler.fit_transform(dfExp)
X_train, X_test, y_train, y_test = train_test_split(dfExp, yT,
                                                    test_size=0.33,
                                                    random_state=42)
## based on the label value introduce more anomlous observations
# xTrain = xTrain.join(y_train, lsuffix='_caller', rsuffix='_other')
# print(xTrain)
# print(y_train)
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# NOTE(review): the R^2 below is computed but its value is discarded.
clf.score(X_test, y_test, sample_weight=None)

# Flatten the predictions — assumes y_train (and hence y_pred) is 2-D with
# one column; confirm against the shape of yT.
list_of_lists = y_pred
flattened = [val for sublist in list_of_lists for val in sublist]

# Rebuild an output frame without the raw expense columns and attach the
# predicted totals.
dfOut = dfOg
dfOut = dfOut.drop([
    'travel_start_date', 'per_diem_based_on_rate', 'approval_date', 'taxi',
    'travel_end_date', 'air_fare', 'mileage_based_on_rate', 'mileage',
    'hotel', 'car_rental', 'per_diem'
], axis=1)
dfOut['TotalExpensePred'] = pd.DataFrame(flattened)
dfOut.to_excel('outputRidge.xlsx')
dfLogistic = pd.read_excel('outputRidge.xlsx')
plt.plot(X, y, "b.", linewidth=3)
plt.legend(loc='upper left')
plt.xlabel('x')
plt.axis([0, 3, 0, 4])

# Side-by-side Ridge plots: linear (left) vs polynomial (right) for several
# regularization strengths.
plt.figure()
plt.subplot(121)
plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)
plt.ylabel('y', rotation=0, fontsize=18)
plt.subplot(122)
plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)
plt.show()

#Now lets do Ridge Regression with scikit-learn
ridge_reg = Ridge(alpha=1, solver="cholesky", random_state=42)
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])

#We can also do this using SGD by adding a penalty hyperparameter=l2, meaning we want to add a regularization term to cost function
sgd_reg = SGDRegressor(penalty='l2')
sgd_reg.fit(X,y)
sgd_reg.predict([[1.5]])

#----------------------------------------------------------LASSO REGRESSION---------------------------------------------------------
#Lasso stands for: Least Absolute Shrinkage and Selection Operator Regression. Very similar to Ridge reg but adds a regularization term
#To the cost function but uses the l1 norm of the weight vector instead of half the square of the l2 norm used in Ridge
#An important characteristic of lasso is that it tends to completely eliminate(set to zero) the weights(thetas) of least important features
#Lets plot the same data as before but using Lasso models and smaller alpha instead of our Ridge model
plt.figure()
plt.subplot(121)
plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)
plt.ylabel('y', rotation=0)
# Reading Data
data = pd.read_csv('headbrain.csv')
data.head()

# Collecting X and Y
X = data['Head Size(cm^3)'].values
Y = data['Brain Weight(grams)'].values

# Reshape the single feature into the (n_samples, 1) column sklearn expects.
m = len(X)
X = X.reshape((m, 1))
#X = [x[0] for x in X1]
print(X)
#input()

# Model Intialization
# Fixed: removed the dead `reg = Ridge(alpha=0.05, normalize=True)` line —
# it was immediately overwritten by the default-constructed model below, and
# its `normalize` kwarg no longer exists in scikit-learn >= 1.2, so the dead
# line would crash on modern versions without ever being used.
reg = Ridge()

# Data Fitting
reg = reg.fit(X, Y)

# Y Prediction
Y_pred = reg.predict(X)

# Model Evaluation
rmse = np.sqrt(mean_squared_error(Y, Y_pred))
r2 = reg.score(X, Y)
print("RMSE")
print(rmse)
print("R2 Score")
print(r2)
plt.grid()

show_plot(X_train, 'Training Data')
show_plot(X_test, 'Testing Data')

from sklearn.linear_model import Ridge
# Note that Ridge regression performs linear least squares with L2 regularization.

# Create and train the Ridge Linear Regression Model
regression_model = Ridge()
regression_model.fit(X_train, y_train)

lr_accuracy = regression_model.score(X_test, y_test)
print("Linear Regression Score: ", lr_accuracy)

# Predict over the full dataset; elements are taken as [value] rows, so i[0]
# assumes 2-D predictions — confirm the shape of X/y.
predicted_prices = regression_model.predict(X)
Predicted = []
for i in predicted_prices:
    Predicted.append(i[0])

close = []
for i in price_volume_target_scaled_df:
    close.append(i[0])

# NOTE(review): df_predicted is a single-column slice of another frame;
# the assignments below can trigger pandas' SettingWithCopyWarning — use
# .copy() on the slice if that matters here.
df_predicted = price_volume_target_df[['Date']]
df_predicted['Close'] = close
df_predicted['Prediction'] = Predicted
interactive_plot(df_predicted, "Original Vs. Prediction")

#LSTM Series model
price_volume_df = individual_stock(stock_price_df, stock_vol_df, 'sp500')