def weight_analysis(verbose=0, stack_option='s'):
    """Fit a non-negative Lasso over per-model predictions to learn ensemble
    blending weights, and log the weights and per-model AUC scores.

    Parameters:
        verbose: passed through to compute_weights workers.
        stack_option: 's' selects the STACK model list, anything else MODELS.

    Side effects: logs only; nothing is returned.  Relies on module-level
    STACK, MODELS, settings, compute_weights, util and np/itertools.
    Python 2 code (uses `func_name` and `xrange`).
    """
    logging.info('starting ensemble weight analysis')
    stack = STACK if stack_option == 's' else MODELS
    # fan the per-driver work out over 4 worker processes
    pool = multiprocessing.Pool(processes=4)
    drivers = settings.DRIVER_IDS#[:1000]
    # CUTOFF = -1 means "all but the last row" trains the blender; only the
    # final row is held out below.
    CUTOFF = -1
    results = pool.map(
        compute_weights,
        map(lambda x: (x, verbose, stack_option), drivers)
    )
    # results[k] is assumed to be (something, per-model-predictions, labels)
    # per driver — TODO confirm against compute_weights.
    predictions = {}
    for i, get_data, model, _ in stack:
        predictions[i] = np.array(list(itertools.chain(*[r[1][i] for r in results])))
    testY = list(itertools.chain(*[r[2] for r in results]))
    # human-readable model names, sorted so logs are stable across runs
    model_names = [
        ('%s.%s.%s' % (get_data.func_name, model.__name__, i), i)
        for i, get_data, model, repeat in stack
    ]
    model_names.sort(key=lambda x: x[0])
    keys = [x[1] for x in model_names]
    model_names = [x[0] for x in model_names]
    # alpha=0.0 with positive=True: effectively non-negative least squares
    # (sklearn warns that alpha=0 is better served by LinearRegression)
    lasso = Lasso(alpha=0.0, positive=True)
    trainX = []
    for row_id in xrange(len(testY)):
        train_row = [predictions[i][row_id] for i in keys]
        trainX.append(train_row)
    a, b = trainX[:CUTOFF], trainX[CUTOFF:]
    c, d = testY[:CUTOFF], testY[CUTOFF:]
    lasso.fit(a, c)
    pred = lasso.predict(b)
    pred_train = lasso.predict(a)
    #logging.info('auc: %s' % util.compute_auc(d, pred))
    logging.info('coefficients:')
    weights = {}
    for i, name in enumerate(model_names):
        logging.info('%s: %.3f' % (model_names[i], lasso.coef_[i]))
        weights[keys[i]] = lasso.coef_[i]
    logging.info('individual scores:')
    for i, key in enumerate(keys):
        logging.info('%s: %.3f' % (
            model_names[i],
            util.compute_auc(testY, predictions[key])
        ))
    logging.info('weights dictionary: %s' % weights)
    # and again in the end, so you don't have to scroll
    logging.info('------------')
    #logging.info('auc: %s' % util.compute_auc(d, pred))
    logging.info('auc train: %s' % util.compute_auc(c, pred_train))
def pred_pH(train, val, test, all_vars, loop): data = (val, test, train) # variable selection pH_lassoed_vars = lass_varselect(train, all_vars, 'pH', .00000001) univ_selector = SelectKBest(score_func = f_regression, k = 1200) univ_selector.fit(train[all_vars], train['pH']) pvals = univ_selector.get_support() chosen = [] for x in range(0, len(all_vars)): if pH_lassoed_vars[x] | pvals[x]: chosen.append(all_vars[x]) lass_only = [] for x in range(0, len(all_vars)): if pH_lassoed_vars[x]: lass_only.append(all_vars[x]) # nearest randomforest neigh = RandomForestRegressor(n_estimators=100) neigh.fit(train.ix[:, chosen], train['pH']) for dset in data: dset['pH_for_prds'] = neigh.predict(dset.ix[:, chosen]) # lasso lass = Lasso(alpha=.000000275, positive=True) lass.fit(train[all_vars], train['pH']) for dset in data: dset['pH_las_prds'] = lass.predict(dset[all_vars]) # ridge pH_ridge = RidgeCV(np.array([.6]), normalize=True) pH_ridge.fit(train[all_vars], train['pH']) for dset in data: dset['pH_rdg_prds'] = pH_ridge.predict(dset[all_vars]) # combination models= [ 'pH_rdg_prds', 'pH_las_prds', 'pH_for_prds', 'pH_for_prds' ] name = 'pH_prds' + str(object=loop) write_preds(models, name, train, val, test, 'pH')
def lassoRegression(X,y):
    """Fit a Lasso (alpha=1e-7) on degree-40 polynomial features of X and
    save a scatter-plus-fit plot to 'plot-lassoRegression.png'.

    X is expected to be a single-column feature array over [0, 2); y the
    matching targets.  Returns None.
    """
    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")
    # Expand to polynomial features and standardize them.
    poly = PolynomialFeatures(degree=40, include_bias=False)
    scaler = StandardScaler()
    train_features = scaler.fit_transform(poly.fit_transform(X))
    # Fit the (barely regularized) Lasso model.
    model = Lasso(alpha=1e-7)
    model.fit(train_features, y)
    # Evaluate the fitted curve on a dense grid over [0, 2).
    grid = np.arange(0, 2, 0.01).reshape((-1, 1))
    grid_features = scaler.transform(poly.fit_transform(grid))
    grid_pred = model.predict(grid_features)
    # Plot the data and the fitted curve, then save to disk.
    fig, ax = plt.subplots()
    fig.set_size_inches(h=6.0, w=10.0)
    ax.axis([0, 2, 0, 15])
    ax.scatter(X, y, color="black", s=10.0)
    ax.plot(grid, grid_pred, color='red', linewidth=1.5)
    plt.savefig(filename='plot-lassoRegression.png',
                bbox_inches='tight', pad_inches=0.2, dpi=600)
    return None
def traverse_movies_lasso():
    """Walk the global `data` movie list in order; after index 3695, fit a
    Lasso on all movies seen so far and record the absolute error of the
    one-step-ahead revenue prediction.  Prints the mean absolute error.

    Python 2 code (print statements).  Relies on module-level data, getLBMap,
    createEmpty, vectorizeMovie, update and avg_float_list.
    """
    LBMAP = getLBMap()
    DMAP = createEmpty()
    P_ERRORS, ERRORS = [], []
    training_data, training_response = [], []
    for i in range(len(data)):
        movie = data[i]
        m_rev = movie['revenue']
        myvector = vectorizeMovie(movie, LBMAP, DMAP)
        if i > 3695:
            # refit from scratch for every evaluated movie — O(n) fits total
            model = Lasso(alpha = .05)
            model.fit(training_data, training_response)
            raw = math.fabs(model.predict(myvector) - m_rev)
            ERRORS.append(raw)
            #P_ERRORS.append(round(raw/m_rev, 4))
        # current movie joins the training pool only after being predicted
        training_data.append(myvector)
        training_response.append(m_rev)
        DMAP = update(movie, DMAP)
    #print 'all', avg_float_list(P_ERRORS)
    print 'all', avg_float_list(ERRORS)
def reg_skl_lasso(param, data):
    """Train a normalized Lasso with param["alpha"] on the regression target
    and score it on the CV split.

    `data` is the 6-tuple (X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr,
    y_reg_cv); the classification targets are unused here.
    Returns (rmse, predictions-on-X_cv).
    """
    X_tr, X_cv, y_class_tr, y_class_cv, y_reg_tr, y_reg_cv = data
    model = Lasso(alpha=param["alpha"], normalize=True)
    model.fit(X_tr, y_reg_tr)
    cv_pred = model.predict(X_cv)
    return getscoreRMSE(y_reg_cv, cv_pred), cv_pred
def lassoreg(a):
    """Fit Lasso(alpha=a) on the module-level base data, print the training
    R^2 score, and write the test-set predictions to lasso.csv."""
    print ("Doing lasso regression")
    model = Lasso(alpha=a)
    model.fit(base_X, base_Y)
    print ("Score = %f" % model.score(base_X, base_Y))
    predictions = model.predict(X_test)
    write_to_file("lasso.csv", predictions)
def calc_linear_regression(files, data_matrix, target, results):
    """Fit a default Lasso on (data_matrix, target), track the best run seen
    so far via the global `best`, and append (files, rss, var, coef_) to
    `results` (mutated in place).

    Python 2 code (print statements).
    """
    lr = Lasso()
    lr.fit(data_matrix, target)
    # NOTE: despite the name, this is the *mean* squared error, not a sum
    rss = np.mean((lr.predict(data_matrix) - target) ** 2)
    var = lr.score(data_matrix, target)
    global best
    if rss < best:
        # new best: dump per-sample target vs. prediction, then coefficients
        for i in range(0,len(target)):
            print str(target[i]) + "\t" + str(lr.predict(data_matrix[i])[0])
        print lr.coef_
        best = rss
    results.append((files, rss, var, lr.coef_))
def classify(self):
    """Perform classification.

    NOTE: despite the name, this fits a Lasso *regressor* (no alpha tuning —
    the GridSearchCV variant is commented out) and stores the continuous
    predictions for the test data.  Attributes are accessed through their
    ClassifyDriver name-mangled spellings, which suggests this method lives
    outside (or was moved out of) class ClassifyDriver — confirm.
    """
    clf = Lasso(max_iter=10000000)
    #parameters = {'alpha':[0.001,0.005,0.01,0.05,0.1,0.5,1,5.0,10.0]}
    #clf = GridSearchCV(lasso, parameters,scoring='roc_auc')
    clf.fit(self._ClassifyDriver__traindata, self._ClassifyDriver__trainlabels)
    self._ClassifyDriver__y = clf.predict(self._ClassifyDriver__testdata)
class Linear():
    """Thin wrapper over linear regressors (Ridge / SVR / NuSVR / Lasso) that
    defers all preprocessing and fitting to predict() time.

    Categorical columns are one-hot encoded and quantitative columns min-max
    scaled; both transformers are fitted on train+test stacked together so
    every category value is seen.
    """
    def __init__(self, type='Ridge', alpha=3, C=1.0, nu=0.2, limit=None, \
            epsilon=0.1):
        # limit: if set, fit() keeps only the most recent `limit` rows
        self.limit = limit
        # `type` selects the underlying estimator; an unknown value leaves
        # self.model unset and predict() will fail with AttributeError
        if type == 'Ridge':
            self.model = Ridge(alpha=alpha)
        elif type == 'SVR':
            self.model = SVR(kernel='linear', C=C, epsilon=epsilon)
        elif type == 'NuSVR':
            self.model = NuSVR(C=C, nu=nu, kernel='linear')
        elif type == 'Lasso':
            self.model = Lasso(alpha=alpha)

    @staticmethod
    def get_cal(m):
        # get calitative (categorical) feature columns; +1 shifts values so
        # OneHotEncoder sees non-negative category codes
        # watch out as indices depend on feature vector!
        return np.hstack((m[:,:23], m[:,24:37], m[:,38:52])) + 1

    @staticmethod
    def get_cant(m):
        # get cantitative (quantitative) feature columns
        # watch out as indices depend on feature vector!
        return np.hstack((m[:,23:24], m[:,37:38], m[:,52:]))

    def fit(self, train_X, train_Y):
        # no fitting done here, just saving data (real fit happens in predict)
        if self.limit:
            if len(train_X) > self.limit:
                train_X = train_X[-self.limit:]
                train_Y = train_Y[-self.limit:]
        self.train_X = np.array(train_X)
        self.train_Y = np.array(train_Y)

    def predict(self, test_X):
        # fitting done here
        # not efficient on the long term: re-encodes and refits on every call
        test_X = np.array(test_X)
        enc = OneHotEncoder()
        scal = MinMaxScaler()
        # fit transformers on train+test combined
        data = np.vstack((self.train_X, test_X))
        enc.fit(self.get_cal(data))
        scal.fit(self.get_cant(data))
        new_train_X1 = enc.transform(self.get_cal(self.train_X))
        new_train_X2 = scal.transform(self.get_cant(self.train_X))
        new_train_X = scipy.sparse.hstack((new_train_X1, new_train_X2))
        new_test_X1 = enc.transform(self.get_cal(test_X))
        new_test_X2 = scal.transform(self.get_cant(test_X))
        new_test_X = scipy.sparse.hstack((new_test_X1, new_test_X2))
        self.model.fit(new_train_X, self.train_Y)
        R = self.model.predict(new_test_X)
        return R
def test_lasso_regression():
    """Smoke test: fit Lasso(alpha=1e-3) on the pickled VIPeR features and
    print the test-set error via abs_error.  Python 2 (print statement)."""
    datafile_viper = '../data_viper/viper.pkl'
    viper = loadfile(datafile_viper)
    from sklearn.linear_model import Lasso
    model = Lasso(alpha=1e-3)
    model.fit(viper.train_feat, viper.train_y)
    y_pred = model.predict(viper.test_feat)
    print 'testing error {}'.format(abs_error(y_pred, viper.test_y))
def main(folds = 5):
    """Load features (sys.argv[1]) and targets (sys.argv[2]), min-max scale,
    split rows 0-99 for CV and 100-111 as an independent test set, run K-fold
    CV with Lasso(alpha=0.3) and report MSEs.

    Python 2 code (print statements, sys.maxint, old cross_validation API).
    """
    print "folds: ", folds
    #read in data, parse into training and target sets
    print "\n ------------------Load file --------------- \n"
    train = np.loadtxt(sys.argv[1]).T
    min_max_scaler = preprocessing.MinMaxScaler()
    train = min_max_scaler.fit_transform(train)
    #test data set (rows 100-111 held out entirely from CV)
    xtest = train[100:112, :]
    train = train[0:100, :]
    print "Size of read data: ", train.shape
    #train = imputation_missingValue(train)
    print "After Standardization:"
    print train
    target = np.loadtxt(sys.argv[2]).T
    ytest = target[100:112, :]
    target = target[0:100,:]
    print "Size of read data: ", target.shape
    al = 0.3
    rf = Lasso(alpha=al)
    #Simple K-Fold cross validation.
    cv = cross_validation.KFold(len(train), folds)
    #iterate through the training and test cross validation segments and
    #run the classifier on each one, aggregating the results into a list
    results = []
    i = 0
    min_MSE = sys.maxint
    best_train = -1
    best_test = -1
    for traincv, testcv in cv:
        start = timeit.default_timer()
        i += 1
        print i, "epoch"
        rf.fit(train[traincv], target[traincv])
        prediction = rf.predict(train[testcv])
        MSE = mean_squared_error(target[testcv], prediction)
        print "MSE: ", MSE, " for ",i
        # remember the fold with the lowest validation MSE
        if min_MSE > MSE:
            best_train = traincv
            best_test = testcv
            min_MSE = MSE
        results.append(MSE)
        stop = timeit.default_timer()
        print "Program running time: ", stop - start
    #print out the mean of the cross-validated results
    print "Results: " + str( np.array(results).mean() ), "for folds: ", folds
    # refit on the best fold's training rows, then score the held-out rows
    print "Results for independent data: ", mean_squared_error(rf.fit(train[best_train], target[best_train]).predict(xtest), ytest)
    print "R squared:"
    print "alpha:", al
def fit_predict_model(l1_penalty):
    """Fit a normalized Lasso on the module-level `training` frame for each
    penalty in l1_penalty and evaluate it on `validation`.

    Returns (RSS, num_nonzero_coeff, model): RSS[i] is the validation residual
    sum of squares for l1_penalty[i]; num_nonzero_coeff[i] counts nonzero
    coefficients plus a nonzero intercept; model is the estimator fitted with
    the *last* penalty in the list.
    """
    RSS = np.zeros(len(l1_penalty))
    num_nonzero_coeff = np.zeros(len(l1_penalty))
    for idx, penalty in enumerate(l1_penalty):
        model = Lasso(alpha=penalty, normalize=True)
        model.fit(training[all_features], training['price'])
        residuals = model.predict(validation[all_features]) - validation['price']
        RSS[idx] = np.sum(residuals ** 2)
        num_nonzero_coeff[idx] = (np.count_nonzero(model.coef_)
                                  + np.count_nonzero(model.intercept_))
    return (RSS, num_nonzero_coeff, model)
def lasso_regression(alpha):
    """Fit a normalized Lasso on the module-level (A_x, A_y) data.

    Returns [rss, intercept, coef_0, coef_1, ...] where rss is the in-sample
    residual sum of squares.
    """
    # Fit the model on the full data set.
    model = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
    model.fit(A_x, A_y)
    fitted = model.predict(A_x)
    # Assemble the result in the pre-defined [rss, intercept, *coefs] format.
    rss = sum((fitted - A_y) ** 2)
    return [rss, model.intercept_] + list(model.coef_)
def lasso_regression(data, predictors, alpha):
    """Fit a normalized Lasso of data['TransformedLife'] on data[predictors].

    Returns [rss, intercept, coef_0, coef_1, ...] where rss is the in-sample
    residual sum of squares.
    """
    # Fit the model on the requested predictor columns.
    model = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
    model.fit(data[predictors], data['TransformedLife'])
    fitted = model.predict(data[predictors])
    # Assemble the result in the pre-defined [rss, intercept, *coefs] format.
    rss = sum((fitted - data['TransformedLife']) ** 2)
    return [rss, model.intercept_] + list(model.coef_)
def linearReg():
    """Fit Lasso(alpha=0.2) on the module-level features/values arrays,
    print the coefficients and in-sample R^2, and bar-plot the coefficients."""
    model = Lasso(alpha=0.2)
    model.fit(features_array, values_array)
    predict_val = model.predict(features_array)  # computed as in original; unused
    print(model.coef_)
    print(model.score(features_array, values_array))
    # one bar per feature column
    fig = plt.figure()
    ax = plt.subplot(111)
    ax.bar(range(0, features.shape[1]), model.coef_)
    plt.show()
def Lasso_model(train_linear, test_linear):
    """Tune alpha with LassoCV on the full training frame, fit a Lasso with
    that alpha on an 80/20 split, plot predicted vs. actual sale price, and
    return expm1-transformed predictions for test_linear.

    Assumes SalePrice is log1p-transformed (hence np.expm1 to invert).
    Relies on external helpers evaluate and write_pkl; writes the chosen
    alpha to a hardcoded user-specific pickle path.
    """
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    real_train_tar=np.expm1(train_linear_tar)
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    # NOTE(review): duplicate of the assignment two lines above
    real_train_tar=np.expm1(train_linear_tar)
    """
    . Lasso model
    """
    # alpha search over a log grid, fitted on the FULL training data
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    # NOTE(review): this MSE is computed but never stored or printed
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    # predicted vs. actual scatter with a y=x reference line
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predict Sle Price')
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    write_pkl(lassocv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl')
    return test_prediction_lasso
def comparaison_moindres_carres(X,Y):
    """Compare Lasso, Ridge and ordinary least squares on a random 70/30
    split and print the three test MSEs.

    Note: random.seed() returns None, so random_state is effectively None
    (fresh randomness each call), as in the original.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.3, random_state=random.seed())
    # build and fit the three estimators
    lasso_model = Lasso(selection='random', random_state=random.seed())
    ridge_model = Ridge()
    ols_model = LinearRegression(n_jobs=-1)
    for estimator in (lasso_model, ridge_model, ols_model):
        estimator.fit(X_train, Y_train)
    # score each on the held-out split
    err_lasso = mean_squared_error(Y_test, lasso_model.predict(X_test))
    err_ridge = mean_squared_error(Y_test, ridge_model.predict(X_test))
    err_reg_lin = mean_squared_error(Y_test, ols_model.predict(X_test))
    print("Erreur de Lasso={:1.2f}\nErreur de Ridge={:1.2f}\nErreur de regression lineaire={:1.2f}\n".format(err_lasso,err_ridge,err_reg_lin))
def pred_sand(train, val, test, all_vars, loop): data = (val, test, train) # variable selection sand_lassoed_vars = lass_varselect(train, all_vars, 'Sand', .00000001) univ_selector = SelectKBest(score_func = f_regression, k = 1200) univ_selector.fit(train[all_vars], train['Sand']) pvals = univ_selector.get_support() chosen = [] for x in range(0, len(all_vars)): if sand_lassoed_vars[x] | pvals[x]: chosen.append(all_vars[x]) lass_only = [] for x in range(0, len(all_vars)): if sand_lassoed_vars[x]: lass_only.append(all_vars[x]) # randomforest forst = RandomForestRegressor(n_estimators=100) forst.fit(train.ix[:, chosen], train['Sand']) for dset in data: dset['sand_for_prds'] = forst.predict(dset.ix[:, chosen]) # SVM svr = svm.SVR(C=14, epsilon=.43, kernel='linear') svr.fit(train.ix[:, lass_only], train['Sand']) for dset in data: dset['sand_svr_prds'] = svr.predict(dset.ix[:, lass_only]) # lasso lass = Lasso(alpha=.0000001, positive=True) lass.fit(train[all_vars], train['Sand']) for dset in data: dset['sand_las_prds'] = lass.predict(dset[all_vars]) # ridge sand_ridge = RidgeCV(np.array([.7]), normalize=True) sand_ridge.fit(train[all_vars], train['Sand']) for dset in data: dset['sand_rdg_prds'] = sand_ridge.predict(dset[all_vars]) # combination models= ['sand_las_prds', 'sand_rdg_prds', 'sand_for_prds', 'sand_for_prds', 'sand_svr_prds'] #print train.ix[0:20, models] name = 'sand_prds' + str(object=loop) write_preds(models, name, train, val, test, 'Sand')
def pred_Ca(train, val, test, all_vars, loop):
    """Build stacked Ca predictions and blend them via write_preds.

    Adds 'Ca_for_prds' (random forest), 'Ca_las_prds' (lasso) and
    'Ca_rdg_prds' (ridge) columns to train/val/test in place, then writes the
    blend as 'Ca_prds<loop>'.

    Relies on external helpers lass_varselect and write_preds.
    """
    data = (val, test, train)
    # variable selection: union of a lasso screen and univariate F-regression
    Ca_lassoed_vars = lass_varselect(train, all_vars, 'Ca', .0000000001)
    univ_selector = SelectKBest(score_func = f_regression, k = 1400)
    univ_selector.fit(train[all_vars], train['Ca'])
    pvals = univ_selector.get_support()
    chosen = []
    # BUG FIX: this loop previously started at range(1, ...), silently
    # skipping all_vars[0]; the sibling predictors (pred_pH, pred_sand)
    # iterate from 0, so do the same here.
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x] | pvals[x]:
            chosen.append(all_vars[x])
    lass_only = []
    for x in range(0, len(all_vars)):
        if Ca_lassoed_vars[x]:
            lass_only.append(all_vars[x])
    # random forest on the union of selected variables
    forst = RandomForestRegressor(n_estimators=120)
    forst.fit(train.ix[:, chosen], train['Ca'])  # .ix is deprecated pandas indexing
    #print forst.feature_importances_
    for dset in data:
        dset['Ca_for_prds'] = forst.predict(dset.ix[:, chosen])
    # lasso (non-negative coefficients, very small alpha)
    lass = Lasso(alpha=.0000001, positive=True)
    lass.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_las_prds'] = lass.predict(dset[all_vars])
    # ridge with a single fixed alpha of .5
    Ca_ridge = RidgeCV(np.array([.5]), normalize=True)
    Ca_ridge.fit(train[all_vars], train['Ca'])
    for dset in data:
        dset['Ca_rdg_prds'] = Ca_ridge.predict(dset[all_vars])
    # combination; 'Ca_for_prds' is listed twice, double-weighting the forest
    models= ['Ca_las_prds', 'Ca_rdg_prds', 'Ca_for_prds',
        'Ca_for_prds', ]
    name = 'Ca_prds' + str(object=loop)
    write_preds(models, name, train, val, test, 'Ca')
def lasso_regression(data,target,alphas):
    """Compare normalized Lasso at each alpha against plain LinearRegression
    under 10-fold CV; plot per-fold RMSE curves and return the mean RMSEs.

    Returns a list of len(alphas)+1 means: one per alpha, then one for the
    unregularized LinearRegression baseline (appended last).
    """
    plt.figure()
    mean_rmses=[]
    # 10 folds, shuffled, no fixed seed
    kf=KFold(len(target),10,True,None)
    for alpha0 in alphas:
        rmses=[]
        clf=Lasso(alpha=alpha0,normalize=True)
        for train_index, test_index in kf:
            data_train,data_test=data[train_index],data[test_index]
            target_train,target_test=target[train_index],target[test_index]
            clf.fit(data_train,target_train)
            # print(clf.sparse_coef_)
            rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
            rmses.append(rmse)
        mean_rmses.append(np.mean(rmses))
        # one line per alpha: RMSE across the 10 folds
        x0=np.arange(1,11)
        plt.plot(x0,rmses,label='alpha='+str(alpha0),marker='o')
    # unregularized baseline over the same folds
    lr = linear_model.LinearRegression(normalize = True)
    rmses = []
    for train_index, test_index in kf:
        data_train, data_test = data[train_index], data[test_index]
        target_train, target_test = target[train_index], target[test_index]
        lr.fit(data_train, target_train)
        rmse = sqrt(np.mean((lr.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    mean_rmses.append(np.mean(rmses))
    x0=np.arange(1,11)
    plt.plot(x0,rmses,label='linear',marker='*')
    plt.title("RMSE comparison between different alpha values of Lasso regularization")
    plt.xlabel("cross validation indices")
    plt.ylabel("RMSE")
    plt.legend()
    plt.show()
    return mean_rmses
# Pick the best Lasso alpha by 4-fold CV on the training split, refit on the
# whole training split, and record model/metrics into the module-level
# accumulator lists (mseg, R2, models, test_set, bests_alpha).
alphas = [1e-10, 1e-7, 1e-5, 1e-3, 1e-1, 1]
xtrain, xtest, ytrain, ytest = train_test_split(resultx, resulty, train_size=0.9, random_state=22)
xtrain = np.reshape(xtrain, [-1, len(usedcolumns)])
xtest = np.reshape(xtest, [-1, len(usedcolumns)])
general_error = []
for i, alpha in enumerate(alphas):
    mses = []
    # do cross validation to find the best alpha
    for trains, valids in KFold(4, shuffle=True).split(
            range(xtrain.shape[0])):
        lreg = Lasso(alpha=alpha, normalize=True)
        lreg.fit(xtrain[trains], ytrain[trains])
        y_pred = lreg.predict(xtrain[valids])
        mses.append(mse(y_pred, ytrain[valids]))
    general_error.append(np.mean(mses))
# using the entire training dataset to fit the Lasso model with the best alpha
indexs2 = np.argmin(general_error)
best_alpha = alphas[int(indexs2)]
lreg2 = Lasso(alpha=best_alpha, normalize=True)
lreg2.fit(xtrain, ytrain)
y_pred2 = lreg2.predict(xtrain)
# record these data
mseg.append(mse(y_pred2, ytrain))
# BUG FIX: r2_score takes (y_true, y_pred) in that order and is NOT
# symmetric; the original passed the predictions as y_true.
R2.append(r2_score(ytrain, y_pred2))
models.append(lreg2)
test_set.append([xtest, ytest])
bests_alpha.append(best_alpha)
def run_stack(SEED):
    """Iterative backward feature elimination driven by Lasso coefficients.

    Repeatedly: run 5-fold CV with Lasso(alpha=1e-4), average the per-fold
    coefficient vectors, and drop the features whose mean |coef| falls at or
    below the 6th-smallest value (never 'var11' or 'id').  The column set with
    the best CV normalized weighted gini is kept, and the train/test CSVs are
    re-written restricted to those columns.

    SEED is accepted but unused in this body.  Relies on external `score`
    module and the old sklearn cross_validation API.
    """
    trainBaseTarget = pd.read_csv('../preprocessdata/pre_shuffled_target.csv')
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBaseWeight = trainBase['var11']
    #test = pd.read_csv('../data/pre_shuffled_test.csv')
    columns = trainBase.columns.values.tolist()
    columnsHighScore = trainBase.columns.values.tolist()
    print(trainBase.columns)
    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    #test = np.nan_to_num(np.array(test))
    gc.collect()
    avg = 0
    avgLast = -1
    NumFolds = 5
    clf = Lasso(alpha=0.00010) # found with tune_lasso.py
    print ("Data size: " + str(len(trainBase)))
    print("Begin Training")
    lenTrainBase = len(trainBase)
    gc.collect()
    featuresRemaining = []
    avgScore = []
    # outer elimination loop; exits via `break` when < 5 coefficients remain
    while True:
        print(clf)
        avg = 0
        coef_dataset = np.zeros((len(columns),NumFolds))
        foldCount = 0
        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
        for train_index, test_index in Folds:
            print()
            print ("Iteration: " + str(foldCount))
            now = datetime.datetime.now()
            print(now.strftime("%Y/%m/%d %H:%M:%S"))
            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]
            weight = [trainBaseWeight[i] for i in train_index]
            targetTest = [targetBase[i] for i in test_index]
            trainTest = [trainBase[i] for i in test_index]
            weightTest = [trainBaseWeight[i] for i in test_index]
            #print "LEN: ", len(train), len(target)
            target = np.array(np.reshape(target, (-1, 1)) )
            #train = np.array(np.reshape(train, (-1, 1)) )
            weight = np.array(np.reshape(weight, (-1, 1)))
            targetTest = np.array(np.reshape(targetTest, (-1, 1)) )
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)) )
            weightTest = np.array(np.reshape(weightTest, (-1, 1)))
            #clf.fit(train, target, sample_weight = weight
            clf.fit(train, target)
            predicted = clf.predict(trainTest)
            print(str(score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())))
            # accumulate the mean CV gini across folds
            avg += score.normalized_weighted_gini(targetTest.ravel(), predicted.ravel(), weightTest.ravel())/NumFolds
            coef_dataset[:, foldCount] = clf.coef_
            foldCount = foldCount + 1
        coefs = coef_dataset.mean(1)
        sorted_coefs = sorted(map(abs, coefs)) # must start by removing coefficients closest to zero.
        print(coefs)
        print("len coefs: " + str(len(sorted_coefs)))
        if len(sorted_coefs) < 5 :
            break
        # 6th-smallest |coef| — everything at or below it gets dropped
        threshold = sorted_coefs[5]
        print(str(len(columns)))
        print(trainBase.shape)
        toDrop = [] # hey, cannot drop var11 and id columns
        for index in range(len(coefs) - 1, -1, -1): # must reverse columns all shift to lower numbers.
            if abs(coefs[index]) <= threshold and columns[index] != "var11" and columns[index] != "id":# abs(), remove closest to zero.
                print("Drop: " + str(index) + " " + columns[index] + " " + str(coefs[index]))
                #trainBase = np.delete(trainBase,[index], axis=1)
                toDrop.append(index)
                #print(columns)
                # NOTE(review): this membership test is always True here,
                # since columns[index] is read from `columns` itself
                if columns[index] in columns:
                    columns.remove(columns[index])
                #print(columns)
        print("start drop")
        trainBase = np.delete(trainBase,toDrop, axis=1)
        print("End drop")
        # keep the best-scoring column set seen so far
        if avg > avgLast:
            print("Saving Copy " + str(avgLast) + " " + str(avg))
            avgLast = avg
            columnsHighScore = columns.copy()
        print("Threshold: " + str(threshold))
        print ("------------------------Average: " + str(avg))
        print(columnsHighScore)
        print(str(len(columns)))
        print(trainBase.shape)
        featuresRemaining.append(len(columns))
        avgScore.append(avg)
        #break
    gc.collect()
    # re-read the raw CSVs and persist them restricted to the winning columns
    trainBase = pd.read_csv('../preprocessdata/pre_departition_train.csv')
    trainBase = trainBase.loc[:,columnsHighScore]
    trainBase.to_csv("../models/" + str(clf)[:5] + "_train.csv", index = False)
    gc.collect()
    test = pd.read_csv('../preprocessdata/pre_departition_test.csv')
    test = test.loc[:,columnsHighScore]
    test.to_csv("../models/" + str(clf)[:5] + "_test.csv", index = False)
    print(columnsHighScore)
    print(featuresRemaining)
    print(avgScore)
prediction_lgb = np.exp(gbm.predict(test_feature)) #%% # RandomForestRegressorによる予測 forest = RandomForestRegressor().fit(X_train, y_train) prediction_rf = np.exp(forest.predict(test_feature)) acc_forest = forest.score(X_train, y_train) acc_dic.update(model_forest = round(acc_forest,3)) print(f"training dataに対しての精度: {forest.score(X_train, y_train):.2}") #%% # lasso回帰による予測 lasso = Lasso().fit(X_train, y_train) prediction_lasso = np.exp(lasso.predict(test_feature)) acc_lasso = lasso.score(X_train, y_train) acc_dic.update(model_lasso = round(acc_lasso,3)) print(f"training dataに対しての精度: {lasso.score(X_train, y_train):.2}") #%% # ElasticNetによる予測 En = ElasticNet().fit(X_train, y_train) prediction_en = np.exp(En.predict(test_feature)) print(f"training dataに対しての精度: {En.score(X_train, y_train):.2}") acc_ElasticNet = En.score(X_train, y_train) acc_dic.update(model_ElasticNet = round(acc_ElasticNet,3)) #%%
# Continuation of a coefficient-path loop: `lasso`, `coefs`, `coefs2`,
# `alphas`, `kfcv`, the train/test splits and `xcols` are defined above this
# fragment (out of view).
coefs2.append(lasso.coef_)
# coefficient paths vs. alpha on a log x-axis
ax2 = plt.gca()
ax2.plot(alphas*2, coefs)
ax2.set_xscale('log')
ax2.set_title('Lasso')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights')

##Lasso regression with cross-validation##
#LassoCV with 10-fold cross-validation(similar to ISLR)#
lcv = LassoCV(alphas=None, max_iter=100000, normalize=True, cv=kfcv, n_jobs=2)
lcv.fit(X_train, Y_train)
print('\nBest LassoCV alpha value:')
print(lcv.alpha_)

#Lasso regression using best alpha#
lbest = Lasso(alpha=lcv.alpha_, normalize=True)
lbest.fit(X_train, Y_train)
print('\nBest Lasso MSE:')
print(mean_squared_error(Y_test, lbest.predict(X_test)))
print('\nLasso Coeficients:')
print(pd.Series(lbest.coef_, index=xcols))
plt.show()
def Lasso_Reg(alpha):
    """Fit a normalized Lasso on the module-level (X3_train, y3_train) split.

    Returns (fitted_model, predictions_on_X3_test).
    """
    model = Lasso(alpha=alpha, normalize=True)
    model.fit(X3_train, y3_train)
    test_pred = model.predict(X3_test)
    return model, test_pred
#Let us predict the stock market for the Future 30 days days = 20 data_seed = df['Adj Close'].values[-window_size:][None] input_values = { 'Lasso': data_seed, 'Ridge': data_seed, 'BayesianRidge': data_seed, 'ElasticNet': data_seed } values = {'Lasso': [], 'Ridge': [], 'BayesianRidge': [], 'ElasticNet': []} for i in range(days): values['Lasso'].append(reg_1.predict(input_values['Lasso'])[0]) values['Ridge'].append(reg_2.predict(input_values['Ridge'])[0]) values['BayesianRidge'].append( reg_3.predict(input_values['BayesianRidge'])[0]) values['ElasticNet'].append(reg_4.predict(input_values['ElasticNet'])[0]) for v in input_values: val = input_values[v] val = np.insert(val, -1, values[v][-1], axis=1) val = np.delete(val, 0, axis=1) input_values[v] = val.copy() for v in input_values: values[v] = np.array(values[v]) # Plotting the Predictions of all the four Regressors in sub plots last_date = datetime.strptime("{:%Y-%m-%d}".format(df.index[-1]), '%Y-%m-%d')
# Tune alpha with 10-fold LassoCV on PCA-transformed regressors, refit a
# positive Lasso with that alpha, then compute RMSE / MAE / MAPE on the test
# set.  regressors_*_pca and target_* come from out-of-view code.
lassocv = LassoCV(alphas=None,
                  cv=10,
                  max_iter=100000,
                  normalize=False,
                  random_state=1991,
                  positive=True)
lassocv.fit(regressors_train_pca, target_train)

# Fit Lasso model with best alpha
lasso = Lasso(max_iter=10000,
              normalize=False,
              alpha=lassocv.alpha_,
              positive=True)  # a = 17671.398612860448
lasso.fit(regressors_train_pca, target_train)

# Predict on test set
predicted_lasso = lasso.predict(regressors_test_pca)

# RMSE
math.sqrt(mean_squared_error(target_test, predicted_lasso))
# MAE
mean_absolute_error(predicted_lasso, target_test)
#MAPE
np.mean(np.abs((target_test - predicted_lasso) / target_test))

# NOTE(review): this statement is truncated in the source — the closing
# parenthesis of set_index(...) is missing past this fragment's end.
predicted_df_lasso = pd.DataFrame({
    'Predicted_Values': list(predicted_lasso.flatten().astype(int)),
    'Actual_Values': list(target_test)
}).set_index(
    target_test.index
cv_scores = cross_val_score(reg,X,y, cv=5) # Print the 5-fold cross-validation scores print(cv_scores) print("Average 5-Fold CV Score: {}".format(np.mean(cv_scores))) #regularised linear regression #we will use 2 types of linear regression, ridge and lasso #lasso regression. Here we will try to predict the data from testdata.csv with lasso regression. Alo we will draw a graph to show the selection of features with lasso regression lasso = Lasso(alpha = 0.4, normalize = True) lasso.fit(X,y) y_pred_all = lasso.predict(X_dummy_all) print('predicted value using all the features and using lasso regression is : '+str(y_pred_all)) #code to draw the graph plt.clf() lasso_coef = lasso.coef_ df_columns = np.array(['population', 'fertility', 'HIV', 'CO2', 'BMI_male', 'GDP','BMI_female', 'child_mortality']) plt.plot(range(len(df_columns)), lasso_coef) plt.xticks(range(len(df_columns)), df_columns, rotation=60) plt.margins(0.02) plt.savefig('lassofig.png') #ridge regression
# NOTE(review): fragment starts mid-call — `color="chocolate")` closes a
# plotting call whose beginning is out of view.
        color="chocolate")
plt.show()
print()

#######
#Ridge#
#######
print('##################')
print('#RIDGE Regression#')
print('##################')
ridge = Ridge().fit(X_train, y_train)
y_predicted_ridge = ridge.predict(X_test)
# threshold the continuous regression output at 0.5 to get binary labels
y_predicted_binary_ridge = [1 if yp >= 0.5 else 0 for yp in y_predicted_ridge]
print('The accuracy score of Ridge Regression is: {:.3f}'.format(
    accuracy_score(y_test_binary, y_predicted_binary_ridge)))
print()

#######
#Lasso#
#######
print('##################')
print('#LASSO Regression#')
print('##################')
# regression on binarized targets, then thresholded at 0.5 for accuracy
lasso = Lasso().fit(X_train, y_train.value.apply(getBinary))
y_predicted_lasso = lasso.predict(X_test)
y_predicted_binary_lasso = [1 if yp >= 0.5 else 0 for yp in y_predicted_lasso]
print('The accuracy score of Lasso Regression is: {:.3f}'.format(
    accuracy_score(y_test_binary, y_predicted_binary_lasso)))
# Run prediction on the Kaggle test set. y_pred_xgb = regr.predict(test_df_munged) ################################################################################ from sklearn.linear_model import Lasso # I found this best alpha through cross-validation. best_alpha = 0.00099 regr = Lasso(alpha=best_alpha, max_iter=50000) regr.fit(train_df_munged, label_df) # Run prediction on training set to get a rough idea of how well it does. y_pred = regr.predict(train_df_munged) y_test = label_df print("Lasso score on training set: ", rmse(y_test, y_pred)) # Run prediction on the Kaggle test set. y_pred_lasso = regr.predict(test_df_munged) ################################################################################ # Blend the results of the two regressors and save the prediction to a CSV file. y_pred = (y_pred_xgb + y_pred_lasso) / 2 y_pred = np.exp(y_pred) pred_df = pd.DataFrame(y_pred, index=test_df["Id"], columns=["SalePrice"]) pred_df.to_csv('output_XGBoost.csv', header=True, index_label='Id')
# Fit on the full Boston data to inspect which coefficients Lasso zeroes out
# (`lasso`, `boston` and the train/test splits come from earlier cells).
lasso.fit(boston.data, boston.target)
lasso_coef = lasso.coef_
lasso_coef

# In[39]:

plt.plot(range(13), lasso_coef)
plt.xticks(range(13), boston.feature_names)
# BUG FIX: the original did `plt.ylabel = ('coefficents')`, which rebinds the
# pyplot ylabel *function* to a string instead of labelling the axis (and
# breaks every later plt.ylabel(...) call in the session).  Call it instead.
plt.ylabel('coefficients')
plt.show()

# In[41]:

lasso = Lasso(alpha=0.1, normalize=True)
lasso.fit(x_train, y_train)
y_lasso = lasso.predict(x_test)
lasso_mse = mean_squared_error(y_test, y_lasso)
lasso_mse

# In[43]:

from sklearn.linear_model import Ridge

# In[44]:

ridge = Ridge(alpha=0.1, normalize=True)
ridge.fit(x_train, y_train)
y_ridge = ridge.predict(x_test)
ridge_mse = mean_squared_error(y_test, y_ridge)
ridge_mse
# learn scalers. Squishes feature data set into the same scale. Without this the RMSE doubles!! feature_scaler = RobustScaler(quantile_range=(25, 75)).fit(feature_train) # perform scaling feature_train = feature_scaler.transform(feature_train) feature_test = feature_scaler.transform(feature_test) #Run Lasso model from sklearn.linear_model import Lasso from sklearn.metrics import r2_score, classification_report, mean_squared_error alpha = 0.01 lasso = Lasso(alpha=alpha) lasso.fit(feature_train, target_train) pred_train_lasso = lasso.predict(feature_train) #here we're asking the model to predict the results of using the test data set. i.e. if we passed it next years survey numbers, it would predict the happiness score # We'd need to pass in the features in the same order as we define above: print(lasso.predict([ [10, 10 ] ])) pred_test_lasso = lasso.predict(feature_test) #RMSE. Reslts are c.0.25, with the dependancy (happiness) ranging from 2.8-7.5. Therefore RMSE is small (good!) # print("RMSE train set: ", np.sqrt(mean_squared_error(target_train,pred_train_lasso))) # print("RMSE test set: ", np.sqrt(mean_squared_error(target_test,pred_test_lasso))) # #R-squared values. values =0.95, 1 means the model explains the variability in happiness perfectly. So R-squared is high # print("R-squared train set: ", r2_score(target_train, pred_train_lasso)) # print("R-squared test set: ", r2_score(target_test, pred_test_lasso)) train_score = lasso.score(feature_train, target_train) test_score = lasso.score(feature_test, target_test)
# Grid-search the Lasso penalty over a log grid via ExperimentUtils (external
# helper), refit at the optimum with a much larger iteration budget, and log
# MSE / coefficient loss vs. an oracle beta to per-metric files.
# Python 2 code (print statement at the end).
lassoLams = np.logspace(-5,6,23)
def Learn(lam):
    # low max_iter keeps the grid search cheap; the final refit uses 2.5e5
    lasso = Lasso(alpha=lam,fit_intercept=False,copy_X=True,max_iter=5.e3)
    lasso.fit(X_train,y_train)
    return lasso
optLam = ExperimentUtils.gridSearch1D(lassoLams, Learn, Eval, MAX=False,verbose=False)
lasso = Lasso(alpha=optLam,fit_intercept=False,copy_X=True,max_iter=2.5e5)
lasso.fit(X,y)
lasso_yhat = np.array([lasso.predict(X_test)]).T
lasso_mse = sum((y_test - lasso_yhat) ** 2)
lasso_beta = np.array([lasso.coef_]).T
# worst-case absolute coefficient error against the known true beta
lasso_betaLoss = max(abs(lasso_beta - oracleBeta))[0]
with open(mseFile('LASSO'),'a') as f:
    f.write("%15.10f " % lasso_mse)
with open(betaFile('LASSO'),'a') as f:
    f.write("%15.10f " % lasso_betaLoss)
with open(lamFile('LASSO'),'a') as f:
    f.write("%15.10f " % optLam)
print "LASSO MSE: %f BETA_LOSS: %f OPT_LAM: %f" % (lasso_mse, lasso_betaLoss, optLam)

############
## Oracle ##
############
#tune the lambda parameter by applying k-fold cross validation kf = KFold(N, n_folds=5) #produce the k folds Lambda = np.arange(0.001, 1.0, 0.001) #a list of lambdas Prediction_error = [] #an empty list to hold the prediction error for l in Lambda: #loop over lambdas pe = 0.0 #initialize prediction error for train_index, test_index in kf: #loop over the folds X_train, X_test = X[train_index], X[ test_index] #create training and test independent variable data y_train, y_test = y[train_index], y[ test_index] #create training and test dependent variable data model = Lasso(l) #create the model object results = model.fit(X_train, y_train) #fit the model pe += sum( (model.predict(X_test) - y_test )**2) #predict the test data, compute the error, and add to total Prediction_error.append(pe) #append the prediction error #run the lasso: #Lambda = sum(((1.0/np.array(Prediction_error))/sum(1.0/np.array(Prediction_error)))*np.array(Lambda)) #compute lambda as the weighted average model = Lasso(Lambda[Prediction_error.index( min(Prediction_error))]) #generate a model object results = model.fit(X, y) #fit the model for i, j in zip(results.coef_, data[2]): #loop over results print('Lasso:', round(i, 4), ' True', round(j, 4)) #print and compare with the truth
# Track Lasso validation error as a function of iteration count: the model is
# capped at max_iter=1 with warm_start=True, so each .fit() call advances the
# coordinate-descent solver by one more pass, letting us plot convergence.
# (Python 2 print statements.)
feature_selector = Lasso(alpha=alpha, fit_intercept=True, max_iter=1, \
                         warm_start=warm_start, positive=False, tol=0.0)
# NOTE(review): it_since_min, delta_it, i, it_array and valid_array must be
# initialised before this loop, and it_since_min is not updated in the visible
# body - the termination bookkeeping presumably lives outside this excerpt;
# confirm the loop actually terminates.
while it_since_min < delta_it:
    i += 1
    print 'fitting'
    sys.stdout.flush()
    feature_selector.fit(data_train, salaries_train)
    print 'predicting'
    sys.stdout.flush()
    # salaries were scaled for training; divide by scale_fac to report errors
    # in original units
    salaries_pred = feature_selector.predict(data_valid)/scale_fac
    error = np.average(np.abs(salaries_valid/scale_fac - salaries_pred))
    average_salary = np.average(salaries/scale_fac)
    coeff = feature_selector.coef_
    intercept = feature_selector.intercept_
    it_array = np.append(it_array, [i])
    valid_array = np.append(valid_array, [error])
    # live plot of mean validation error vs. iteration count
    plt.clf()
    plt.plot(it_array, valid_array)
    plt.xlabel('iteration count')
    plt.ylabel('mean validation error')
    plt.title('Lasso (linear) Model Selection alpha = ' + str(alpha))
    plt.pause(0.001)
    fig.savefig('../../plots/lasoo_linear_model_' + str(alpha) + '_' + str(error) + '.pdf')
# Fit Lasso regression on the Boston housing data (with an explicit bias
# column), plot predicted vs. real values, and set up 10-fold CV for RMSE.
from sklearn.cross_validation import KFold
from sklearn.linear_model import Lasso
import numpy as np
import pylab as pl
from sklearn.datasets import load_boston

# Loading boston datasets
boston = load_boston()

# Adding a column of 1s for x0 (Regression Design Matrix)
x = np.array([np.concatenate((v,[1])) for v in boston.data])
y = boston.target

# Create linear regression object with a lasso coefficient 0.5
lasso = Lasso(fit_intercept=True, alpha=0.5)

# Train the model using the training set
lasso.fit(x,y)

# Predictions for the whole design matrix.
# BUG FIX: the original also built predictions row-by-row with
# `p = np.array([lasso.predict(xi) for xi in x])` and then immediately
# overwrote `p` with the vectorised call below, so the loop's result was
# always discarded (and predict() on a single 1-D row is rejected by modern
# sklearn). The dead per-row loop is removed.
p = lasso.predict(x)

# plotting real vs predicted data
pl.plot(p, y,'ro')
pl.xlabel('predicted')
pl.title('Lasso Regression, alpha=0.5')
pl.ylabel('real')
pl.grid(True)
pl.show()

# vector of errors
err = p-y
# Dot product of error vector is sum of squared errors
total_error = np.dot(err,err)
# RMSE on training data
rmse_train = np.sqrt(total_error/len(p))

# Compute RMSE using 10-fold x-validation
kf = KFold(len(x), n_folds=10)
class MetaModel_Lasso(customRegressor):
    """Stacking regressor: base models produce out-of-fold predictions that
    are appended to a preprocessed feature matrix, and a Lasso meta-model is
    fit on the combined matrix to predict log(SalePrice)."""

    def __init__(self, in_df, models, sub_params, n_folds, qualPow, imputeDict):
        # in_df: raw training frame containing a "SalePrice" column.
        # models: base estimators; sub_params: one parameter set per model.
        # n_folds: folds used to build out-of-fold predictions.
        # qualPow / imputeDict: imputation configuration forwarded to helpers.
        super(MetaModel_Lasso,self).__init__()
        self.qualPow = qualPow
        self.imputeDict = imputeDict
        # self.features = features
        self.models = models
        self.subparams = sub_params
        self.meta = None  # fitted Lasso meta-model; created in fitModel()
        # self.model = self.meta # aliases shallow copied for use with base class
        self.n_folds = n_folds
        self.predBool = False  # True once out-of-fold predictions exist
        from meta_features import impute_shell
        self._imputeVals = impute_shell(qualPow)
        tempDF = self._imputeVals(in_df)
        self.X = tempDF.drop(columns=["SalePrice"]).copy()
        # log-transform the target; pipeline_y rescales it robustly below
        self.y = np.log(tempDF.SalePrice).values.reshape(-1,1)
        self.pipeline_X = self._make_pipe()
        self.pipeline_y = RobustScaler()

    def _make_pipe(self):
        """Build the ColumnTransformer-based preprocessing pipeline for X."""
        import meta_features as f
        nonePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value="None"), OneHotEncoder(drop="first"))
        # NOTE(review): zeroPipeline is constructed but its ColumnTransformer
        # entry below is commented out, so it is currently unused.
        zeroPipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), OneHotEncoder(drop="first", categories="auto"))
        scalePipeline = make_pipeline(SimpleImputer(
            strategy="constant", fill_value=0), PowerTransformer())
        regressionPipeline = ColumnTransformer([
            ("setNone", nonePipeline, f.fillNone),
            # ("setZero", zeroPipeline, f.fillZeroCat),
            ("transformed", scalePipeline, f.fillZeroCont),
            # ("dictImputed", make_pipeline(self.dictImputer(f.imputeDict),
            #                 OneHotEncoder(drop="first")), list(f.imputeDict.keys())),
            # ("bool", "passthrough", f.imputeBool),
            ("categoricalInts", "passthrough", f.cat_to_int),
            # ("dropped", "drop", f.dropList)
        ], remainder="drop")
        return make_pipeline(regressionPipeline, RobustScaler())

    def genPreds(self,X,y):
        """Fit each base model on K-1 folds and collect out-of-fold predictions
        into self.oob_preds (one column per base model)."""
        self.predBool = True
        self.model_list = [list() for i in self.models]  # fitted copies, per model
        folds = KFold(n_splits = self.n_folds, shuffle=True, random_state=6)
        oob_preds = np.zeros((X.shape[0], len(self.models)))
        for i, model in enumerate(self.models):
            for trainIdx, outIdx in folds.split(X):
                # deep-copy so every fold gets an independently fitted estimator
                local_model = deepcopy(model)
                self.model_list[i].append(local_model)
                local_model.subset(trainIdx)
                local_model.fitModel(self.subparams[i])
                preds = local_model.predict(X.iloc[outIdx,:])
                oob_preds[outIdx,i] = preds.reshape(-1,)
        self.oob_preds = oob_preds
        # self.meta.fitModel(X, oob_preds, y)

    def fitModel(self,params):
        """Fit the Lasso meta-model; `params` is passed straight to Lasso."""
        self._params = params
        self.meta = Lasso(**params)
        if not self.predBool:
            # out-of-fold predictions are generated lazily on the first fit
            self.genPreds(self.X,self.y)
        piped_X = self.pipeline_X.fit_transform(self.X)
        meta_X = np.column_stack([piped_X,self.oob_preds])
        piped_y = self.pipeline_y.fit_transform(self.y)
        self.meta.fit(meta_X,piped_y)

    def getTrainRsquared(self):
        """Return the meta-model's R^2 on the (transformed) training data."""
        piped_X = self.pipeline_X.transform(self.X)
        meta_X = np.column_stack([piped_X, self.oob_preds])
        piped_y = self.pipeline_y.transform(self.y)
        return self.meta.score(meta_X,piped_y)

    def predict(self,X):
        """Predict new rows: per-model fold predictions are averaged, stacked
        with the preprocessed features, fed to the meta-model, and mapped back
        from the transformed target space via self._invert."""
        piped_X = self.pipeline_X.transform(self._imputeVals(X))
        pred_Data = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.model_list
        ])
        meta_X = np.column_stack([piped_X,pred_Data])
        preds = self.meta.predict(meta_X)
        return self._invert(preds)
# Prepare the test set (impute missing values, integer-encode categoricals),
# fit a Lasso on the training data, and write the Kaggle submission file.
merged_test_data["CompetitionOpenSinceYear"] = merged_test_data["CompetitionOpenSinceYear"].fillna(merged_test_data["CompetitionOpenSinceYear"].median())
# BUG FIX: Series.fillna(other_series) aligns on the *index*, so passing the
# result of .mode() (a Series indexed 0..k-1) only filled the NaN at row 0,
# if any. Take the first mode as a scalar instead so every NaN is filled.
merged_test_data["StoreType"] = merged_test_data["StoreType"].fillna(merged_test_data["StoreType"].mode()[0])
merged_test_data["Assortment"] = merged_test_data["Assortment"].fillna(merged_test_data["Assortment"].mode()[0])

# Encode the categorical columns as small integers.
merged_test_data.loc[merged_test_data["StoreType"] == "a", "StoreType"] = 0
merged_test_data.loc[merged_test_data["StoreType"] == "b", "StoreType"] = 1
merged_test_data.loc[merged_test_data["StoreType"] == "c", "StoreType"] = 2
merged_test_data.loc[merged_test_data["StoreType"] == "d", "StoreType"] = 3
merged_test_data.loc[merged_test_data["Assortment"] == "a", "Assortment"] = 0
merged_test_data.loc[merged_test_data["Assortment"] == "b", "Assortment"] = 1
merged_test_data.loc[merged_test_data["Assortment"] == "c", "Assortment"] = 2
merged_test_data.loc[merged_test_data["Assortment"] == "d", "Assortment"] = 3

# Anything still missing defaults to zero.
merged_test_data = merged_test_data.fillna(0)

las = Lasso()
predictors = ['DayOfWeek', 'Date', 'Promo', 'Promo2', 'Promo2SinceYear', 'Assortment', 'StoreType', 'CompetitionDistance']
las.fit(dataset[predictors], dataset["Sales"])
merged_test_data = merged_test_data[merged_test_data.Id != 0]
predictions = las.predict(merged_test_data[predictors])

submission = pd.DataFrame({
    "Id": merged_test_data["Id"].astype(int),
    "Sales": predictions
})
submission = submission[submission.Id != 0]
submission.to_csv("kaggle.csv", index=False)

#scores = cross_validation.cross_val_score(las, dataset[predictors], dataset["Sales"], cv=3)
#print(scores.mean())
# print 'best rmse for reduced problem: {}. alpha = {}'.format(best_reduced.rmse,best_reduced.alpha) # End Cross validation else: print 'Making test prediction' reg = Lasso(0.28) reg.fit(x_source_tf, y_source) """ Predict output with chosen features and learned coefficients beta """ # load test data and transform samples data_test = data_loader.restore_from_file('test.csv') n_samples_test = data_test.shape[0] ids_test = data_test[:, 0].reshape(n_samples_test, 1) x_test = data_test[:, 1:].reshape(n_samples_test, n_dimensions_x) x_test_tf = feature_transform(feature_vec, x_test) # predict output if transform: y_test = reg.predict(x_test_tf).reshape(n_samples_test, 1) else: y_test = reg.predict(x_test).reshape(n_samples_test, 1) # save output header = np.array(['Id', 'y']).reshape(1, 2) dump_data = np.hstack((ids_test, y_test)) data_loader.save_to_file('results.csv', dump_data, header)
#Mean Squared Error, R^2 print('MSE train: %.3f, test: %.3f' % (mean_squared_error( y_train, y_train_pred), mean_squared_error(y_test, y_test_pred))) print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred))) #3. LASSO regression model X = df.iloc[:, :-1].values y = df[df.columns].values X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42) lasso = Lasso(alpha=1.0) lasso.fit(X_train, y_train) y_train_pred = lasso.predict(X_train) y_test_pred = lasso.predict(X_test) #residual plot plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', edgecolor='white', label='Training data') plt.scatter(y_test_pred, y_test_pred - y_test, c='green', marker='s', edgecolor='white', label='Test data') plt.title('LASSO regression residual errors')
def run_lasso(X_train, y_train, X_test):
    """Fit a Lasso regressor (alpha=1) on the training split and return its
    predictions for the test split."""
    estimator = Lasso(alpha=1)
    estimator.fit(X_train, y_train)
    return estimator.predict(X_test)
def preprocessing():
    """One-hot encode the categorical features, standardise the numeric ones,
    and return the combined design matrix (categoricals first).

    Reads the module-level `features` frame and `CATEGORICAL_FEATURES` list.
    """
    onehotencoder = OneHotEncoder(handle_unknown="ignore")
    categorical_encoded_data = onehotencoder.fit_transform(
        features[CATEGORICAL_FEATURES].values).toarray()
    scaler = StandardScaler()
    scaled_numerical_data = scaler.fit_transform(
        features.drop(CATEGORICAL_FEATURES, axis=1))
    # BUG FIX: `processed_data` used to be a local that silently vanished when
    # the function returned (and the function itself was never called), so the
    # module-level train_test_split below raised NameError. Return the matrix.
    return np.concatenate(
        (categorical_encoded_data, scaled_numerical_data), axis=1)


data = pd.read_csv('airbnb_data.csv')
features = data.drop(
    ['price', 'listing_url', 'image_url', 'title', 'district'], axis=1)
target = data['price']
# Impute the handful of sparsely populated columns before encoding.
features['rating'].fillna(features['rating'].mean(), inplace=True)
features['reviews'].fillna(1, inplace=True)
features['baths'].fillna('1 bath', inplace=True)

# BUG FIX: call preprocessing() once `features` is ready so that
# `processed_data` actually exists at module level.
processed_data = preprocessing()

X_train, X_test, y_train, y_test = train_test_split(processed_data, target, test_size=0.3)

reg = Lasso()
reg.fit(X_train, y_train)
pred = reg.predict(X_test)

print('Regression score of the model', reg.score(X_test, y_test))
print('Mean absolute error for the model', mean_absolute_error(y_test, pred))
# Score the earlier model, then fit Lasso and Ridge on the same split and
# compare their r^2 scores.
r2=r2_score(y_test,y_pred)
print('r^2 score=',r2)

# --------------
'''Prediction using Lasso In this task let's predict the price of the house using a lasso regressor. Check if there is any improvement in the prediction.'''
from sklearn.linear_model import Lasso
# Code starts here
# Instantiate a lasso model (default alpha=1.0)
lasso=Lasso()
# fit model on training data
lasso.fit(X_train,y_train)
# make predictions on test features
lasso_pred=lasso.predict(X_test)
print('lasso test features predictions-',lasso_pred)
# Find the r^2 score
r2_lasso=r2_score(y_test,lasso_pred)
print('r^2 score lasso=',r2_lasso)

# --------------
from sklearn.linear_model import Ridge
# Code starts here
# Same workflow with ridge (L2) regularisation.
ridge=Ridge()
ridge.fit(X_train,y_train)
ridge_pred=ridge.predict(X_test)
print(ridge_pred)
r2_ridge=r2_score(y_test,ridge_pred)
# Seeing the Coefficients: pair each feature name with its fitted ridge weight.
list(zip(x_train.columns,ridge.coef_))

# Lasso regression: select alpha by 10-fold cross-validated MSE over a grid.
alphas=np.linspace(0.0001,1,100)
mse_list=[]
# The fold layout does not depend on alpha, so build it once outside the loop.
kf=KFold(len(x_train),n_folds=10)
for a in alphas:
    lasso = Lasso(fit_intercept=True,alpha=a,max_iter=10000)
    #Computing MSE using 10-fold cross validation
    xval_err=0
    for train, test in kf:
        lasso.fit(x_train.loc[train],y_train[train])
        p=lasso.predict(x_train.loc[test])
        # BUG FIX: mean_squared_error() was called with no arguments, which
        # raises TypeError; pass the held-out truth and the fold predictions.
        xval_err += mean_squared_error(y_train[test], p)
    # mean of the per-fold MSEs
    mse_10cv=xval_err/10
    mse_list.append(mse_10cv)
    #Uncomment below to print error values for individual alphas
    #print('{:.3f}\t {:.6f}\t '.format(a,mse_10cv))

# BUG FIX: the original indexed with `rmse_list==min(mse_list)` where
# rmse_list was a separate (never-filled) list, so the lookup was broken.
# Select the alpha whose CV error is smallest.
best_alpha=alphas[np.argmin(mse_list)]
print('Alpha with min 10cv error is: ',best_alpha)
#Prediction
# Compare ridge, lasso and elastic-net fits on the same (X, y): each model is
# fit in-sample and its training RMSE is computed. The bare trailing names are
# notebook-style cell outputs.
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver="cholesky")
ridge_reg.fit(X, y)
y_ridge = ridge_reg.predict(X)
lin_mse_ridge = mean_squared_error(y, y_ridge)
lin_rmse_ridge = np.sqrt(lin_mse_ridge)
lin_rmse_ridge

from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
y_lasso = lasso_reg.predict(X)
lin_mse_lasso = mean_squared_error(y, y_lasso)
lin_rmse_lasso = np.sqrt(lin_mse_lasso)
lin_rmse_lasso

from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X, y)
y_elastic_net = elastic_net.predict(X)
lin_mse_elastic_net = mean_squared_error(y, y_elastic_net)
lin_rmse_elastic_net = np.sqrt(lin_mse_elastic_net)
lin_rmse_elastic_net
def mult_reg(p_x, p_y):
    """Fit four linear models (OLS, Ridge, Lasso, ElasticNet) on a shared
    train/test split and report predictions plus residual sums of squares.

    Parameters
    ----------
    p_x: pd.DataFrame with regressors or predictor variables
    p_y: pd.DataFrame with variable to predict

    Returns
    -------
    dict with a "summary" of RSS per model, the test target, and one entry
    per model holding rss / predictions / fitted estimator / intercept / coef.
    """
    x_tr, x_te, y_tr, y_te = train_test_split(p_x, p_y,
                                              test_size=.8,
                                              random_state=455)

    # Ordinary least squares, no intercept and no normalisation.
    ols = LinearRegression(normalize=False, fit_intercept=False)
    ols.fit(x_tr, y_tr)
    pred_ols = ols.predict(x_te)

    # L2-penalised regression.
    ridge_mod = Ridge(normalize=True)
    ridge_mod.fit(x_tr, y_tr)
    pred_ridge = ridge_mod.predict(x_te)

    # L1-penalised regression.
    lasso_mod = Lasso(normalize=True)
    lasso_mod.fit(x_tr, y_tr)
    pred_lasso = lasso_mod.predict(x_te)

    # Mixed L1/L2 penalty.
    enet_mod = ElasticNet(normalize=True)
    enet_mod.fit(x_tr, y_tr)
    pred_enet = enet_mod.predict(x_te)

    # RSS = residual sum of squares, computed once per model and reused below.
    rss_ols = sum((pred_ols - y_te)**2)
    rss_ridge = sum((pred_ridge - y_te)**2)
    rss_lasso = sum((pred_lasso - y_te)**2)
    rss_enet = sum((pred_enet - y_te)**2)

    # Return the result of the models.
    return {
        "summary": {
            "linear rss": rss_ols,
            "Ridge rss": rss_ridge,
            "lasso rss": rss_lasso,
            "elasticnet rss": rss_enet
        },
        "test": y_te,
        'linear': {
            'rss': rss_ols,
            'predict': pred_ols,
            'model': ols,
            'intercept': ols.intercept_,
            'coef': ols.coef_
        },
        'ridge': {
            'rss': rss_ridge,
            'predict': pred_ridge,
            'model': ridge_mod,
            'intercept': ridge_mod.intercept_,
            'coef': ridge_mod.coef_
        },
        'lasso': {
            'rss': rss_lasso,
            'predict': pred_lasso,
            'model': lasso_mod,
            'intercept': lasso_mod.intercept_,
            'coef': lasso_mod.coef_
        },
        'elasticnet': {
            'rss': rss_enet,
            'predict': pred_enet,
            'model': enet_mod,
            'intercept': enet_mod.intercept_,
            'coef': enet_mod.coef_
        }
    }
def compute_abundancies(self):
    """Estimate per-spectrum molecular abundancies by ADMM.

    Builds a sparse dictionary D of binned isotope-pattern masses and a data
    matrix Y from the spectra, then solves a Poisson-likelihood Lasso problem
    with a spatial-smoothness (neighbour difference) penalty via ADMM, using
    sklearn Lasso/ElasticNet solvers for the sub-problems. (Python 2 code:
    print statements and xrange.)
    """
    def bin_numbers(mzs):
        # discretise m/z values onto a fixed grid (bin width = 1/200)
        return (mzs * 200).astype(np.int32)

    isotope_patterns = [self.mz_list[f][a] for f in self.sum_formulae for a in self.adducts]
    all_mzs = np.concatenate([pattern[0] for pattern in isotope_patterns])
    # append 'infinity' so that searchsorted always returns indices less than the length
    all_mz_int_indices = np.concatenate((np.unique(bin_numbers(all_mzs)), [np.iinfo(np.int32).max]))

    def sparse_matrix_from_spectra(data, assume_presence=False):
        # Build a (n_mass_bins x n_items) COO matrix of intensities; when
        # assume_presence is False, m/z values outside the known bin set are
        # dropped.
        intensity_list = []
        row_list = []
        len_list = []
        for j, (mzs, intensities) in enumerate(data):
            int_mzs = bin_numbers(mzs)
            idx = all_mz_int_indices.searchsorted(int_mzs)
            if not assume_presence:
                # keep only peaks that land exactly on a known bin
                known = np.where(all_mz_int_indices[idx] == int_mzs)[0]
                intensities = intensities[known]
                idx = idx[known]
                length = len(known)
            else:
                length = len(mzs)
            intensity_list.append(intensities)#/np.linalg.norm(intensities))
            row_list.append(idx)
            len_list.append(length)
        intensities = np.concatenate(intensity_list)
        rows = np.concatenate(row_list)
        columns = np.repeat(np.arange(len(data), dtype=np.int32), len_list)
        result = ssp.coo_matrix((intensities, (rows, columns)),
                                shape=(len(all_mz_int_indices), len(data)),
                                dtype=float)
        return result

    #print self.sum_formulae
    logging.info("computing Y matrix")
    Y = sparse_matrix_from_spectra([(s.mzs, s.intensities) for s in self.spectra])
    #print Y.nnz, Y.shape
    logging.info("computing D matrix")
    D = sparse_matrix_from_spectra(isotope_patterns, assume_presence=True)
    #print D.nnz, D.shape

    n_masses, n_molecules = D.shape
    n_spectra = Y.shape[1]
    np.set_printoptions(threshold='nan', linewidth=300, precision=3, suppress=True)
    #print (D.todense() > 0).astype(int)

    # Map each spectrum to its 4-connected grid neighbours (used for the
    # spatial smoothness penalty).
    neighbors_map = {}
    indices = -1 * np.ones((self.nrows, self.ncols), dtype=int)
    for s in self.spectra:
        x, y = s.coords[:2]
        indices[x, y] = s.index
    for x in xrange(self.nrows):
        for y in xrange(self.ncols):
            neighbors_map[indices[x, y]] = []
            #for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1), (-1, -1), (1, 1), (-1, 1), (1, -1)]:
            for dx, dy in [(-1, 0), (1, 0), (0, -1), (0, 1)]:
                if 0 <= x + dx < self.nrows and 0 <= y + dy < self.ncols:
                    idx = indices[x + dx, y + dy]
                    if idx == -1: continue
                    neighbors_map[indices[x, y]].append(idx)
    # each unordered neighbour pair is counted twice above (Python 2: / is
    # integer division here)
    n_pairs = sum(len(x) for x in neighbors_map.values()) / 2

    def w_w0_update_matrix():
        # Assemble the big sparse design matrix A for the joint (w, w0)
        # Lasso sub-problem; its block structure is asserted at the end.
        xs = []
        ys = []
        data = []
        # upper part (corresponds to DW + W0)
        for i in xrange(n_spectra):
            y_offset = n_molecules * i
            x_offset = n_masses * i
            ys.append(D.col + y_offset)
            xs.append(D.row + x_offset)
            data.append(D.data)
        ys.append(np.repeat(np.arange(n_spectra) + n_molecules * n_spectra, n_masses))
        xs.append(np.arange(n_masses * n_spectra))
        data.append(np.ones(n_masses * n_spectra))
        # middle part (corresponds to W)
        x_offset = n_masses * n_spectra
        ys.append(np.arange(n_molecules * n_spectra))
        xs.append(np.arange(n_molecules * n_spectra) + x_offset)
        data.append(np.ones(n_molecules * n_spectra))
        # lower part (corresponds to the neighbor abundancy differences)
        x_offset = (n_masses + n_molecules) * n_spectra
        for i in neighbors_map:
            for j in neighbors_map[i]:
                # process each unordered pair once
                if i > j: continue
                ys.append(np.arange(n_molecules) + n_molecules * i)
                xs.append(np.arange(n_molecules) + x_offset)
                data.append(np.ones(n_molecules))
                ys.append(np.arange(n_molecules) + n_molecules * j)
                xs.append(np.arange(n_molecules) + x_offset)
                data.append(-1 * np.ones(n_molecules))
                x_offset += n_molecules
        xs = np.concatenate(xs)
        ys = np.concatenate(ys)
        data = np.concatenate(data)
        result = ssp.coo_matrix((data, (xs, ys)), dtype=float)
        assert result.nnz == (D.nnz + n_masses + n_molecules) * n_spectra + n_molecules * n_pairs * 2
        assert result.shape[0] == (n_molecules + n_masses) * n_spectra + n_pairs * n_molecules
        assert result.shape[1] == n_spectra * (n_molecules + 1)
        return result.tocsc()

    A = w_w0_update_matrix()
    print A.shape, A.nnz

    # NOTE(review): nz/xs/ys below are computed but not used in the rest of
    # this method - possibly leftovers; confirm.
    nz = np.where(Y.sum(axis=0)>0)[1]#.A1
    xs = self.coords[nz,0]
    ys = self.coords[nz,1]

    # FIXME: there must be a simpler way!
    # Flatten Y to a vector in spectrum-major (Fortran) order.
    Y = Y.todense().A1.reshape((n_masses, n_spectra)).ravel(order='F')
    print "Y sum:", Y.sum()

    # ADMM variables: z* are the split copies, u* the scaled dual variables.
    z0 = Y+1
    u0 = np.zeros(n_masses * n_spectra)
    z1 = np.zeros(n_molecules * n_spectra)
    u1 = np.zeros(n_molecules * n_spectra)
    z2 = np.zeros(n_pairs * n_molecules)
    u2 = np.zeros(n_pairs * n_molecules)

    from sklearn.linear_model import Lasso, ElasticNet, LinearRegression
    lambda_ = 1.0   # L1 penalty on abundancies
    theta = 1e-20   # quadratic penalty on neighbour differences
    rho = 1.0       # ADMM step size (adapted during the iterations)
    print lambda_/rho/A.shape[0]
    # sklearn scales its objective by the number of samples, hence the
    # division by the row count in every alpha below.
    w_w0_lasso = Lasso(alpha=lambda_/rho/A.shape[0], warm_start=True, fit_intercept=False, positive=True)
    z1_lasso = Lasso(alpha=lambda_/rho/z1.shape[0], fit_intercept=False, warm_start=True, positive=False)
    z2_ridge = ElasticNet(alpha=2*theta/rho/z2.shape[0], l1_ratio=0, warm_start=True, positive=False, fit_intercept=False)

    def w_w0_update():
        # joint update of abundancies w and per-spectrum offsets w0
        rhs = np.concatenate((z0 + 1.0/rho * u0, z1 + 1.0/rho * u1, z2 + 1.0/rho * u2))
        w_w0_lasso.fit(A, rhs)
        w = w_w0_lasso.coef_[:n_molecules*n_spectra]
        w0 = w_w0_lasso.coef_[n_molecules*n_spectra:]
        return w, w0

    def z0_update(Dw_w0, u0):
        # closed-form proximal step for the Poisson likelihood term
        tmp = Dw_w0 - 1/rho * u0 - 1/rho
        return 0.5 * (np.sqrt(tmp ** 2 + 4 * Y / rho) + tmp)

    def z1_update(w, u1):
        # soft-thresholding via a Lasso fit against the identity matrix
        z1_lasso.fit(ssp.eye(z1.shape[0]), w - 1.0 / rho * u1)
        return z1_lasso.coef_

    def z2_update(diffs, u2):
        # ridge shrinkage of the neighbour differences
        z2_ridge.fit(ssp.eye(z2.shape[0]), diffs - 1.0 / rho * u2)
        return z2_ridge.coef_

    def logdot(x, y):
        #if np.any((x>0)&(y==0)):
        #    return -np.inf
        return np.dot(x, np.log(y+1e-32))

    # log-likelihood for the original problem (w, w0 variables)
    def LL(w, Dw_w0=None, diffs=None, w0=None):
        if Dw_w0 is None or diffs is None:
            assert w0 is not None
            rhs = A.dot(np.hstack((w, w0)))
            Dw_w0 = rhs[:n_masses*n_spectra]
            diffs = rhs[(n_masses+n_molecules)*n_spectra:]
        return logdot(Y, Dw_w0) - Dw_w0.sum() - lambda_ * w.sum() - theta * np.linalg.norm(diffs)**2

    # log-likelihood for the modified problem (variables w, w0, z0, z1, z2, u0, u1, u2)
    def LL_ADMM():
        return logdot(Y, z0) - z0.sum() - lambda_ * z1.sum() - theta * np.linalg.norm(z2)**2 \
            - np.dot(u0, z0 - Dw_w0_estimate) \
            - np.dot(u1, z1 - w_estimate) \
            - np.dot(u2, z2 - diff_estimates) \
            - rho/2 * np.linalg.norm(z0 - Dw_w0_estimate) ** 2 \
            - rho/2 * np.linalg.norm(z1 - w_estimate) ** 2 \
            - rho/2 * np.linalg.norm(z2 - diff_estimates) ** 2

    max_iter = 2000
    rhs = None
    for i in range(max_iter):
        logging.info("w,w0 update")
        w_estimate, w0_estimate = w_w0_update()
        rhs_old = rhs
        rhs = w_w0_lasso.predict(A)
        Dw_w0_estimate = rhs[:n_masses*n_spectra]
        diff_estimates = rhs[(n_masses+n_molecules)*n_spectra:]
        #print "w,w0 update", LL(w_estimate, Dw_w0_estimate, diff_estimates)
        #print w_estimate.reshape((self.nrows, self.ncols))
        #print w0_estimate.reshape((self.nrows, self.ncols))

        logging.info("z0 update")
        #print "LL_ADMM after w updates:", LL_ADMM()
        z_old = np.concatenate((z0, z1, z2))
        z0 = z0_update(Dw_w0_estimate, u0)
        #print np.linalg.norm(z0 - Dw_w0_estimate)
        #print "LL_ADMM after z0 update:", LL_ADMM()

        logging.info("z1 update")
        z1 = z1_update(w_estimate, u1)
        #print np.linalg.norm(z1 - w_estimate)
        #print "LL_ADMM after z1 update:", LL_ADMM()
        #print "z1 update", LL(z1, w0=w0_estimate)

        logging.info("z2 update")
        z2 = z2_update(diff_estimates, u2)
        #print np.linalg.norm(z2 - diff_estimates)
        #print "LL_ADMM after z2 update:", LL_ADMM()

        # dual ascent on the scaled multipliers
        u_old = np.concatenate((u0, u1, u2))
        u0 += rho * (z0 - Dw_w0_estimate)
        u1 += rho * (z1 - w_estimate)
        u2 += rho * (z2 - diff_estimates)

        if rhs_old is not None:
            # Adapt rho so primal and dual residuals stay balanced; the
            # sklearn solvers must be rebuilt because their alpha depends
            # on rho.
            z = np.concatenate((z0, z1, z2))
            primal_diff = np.linalg.norm(rhs - z)
            dual_diff = rho * np.linalg.norm(A.T.dot(z - z_old))
            if primal_diff > 10 * dual_diff:
                rho *= 2
                print "rho <-", rho
                w_w0_lasso = Lasso(alpha=lambda_/rho/A.shape[0], warm_start=True, fit_intercept=False, positive=True)
                z1_lasso = Lasso(alpha=lambda_/rho/z1.shape[0], fit_intercept=False, warm_start=True, positive=False)
                z2_ridge = ElasticNet(alpha=2*theta/rho/z2.shape[0], l1_ratio=0, warm_start=True, positive=False, fit_intercept=False)
            elif dual_diff > 10 * primal_diff:
                rho /= 2
                print "rho <-", rho
                w_w0_lasso = Lasso(alpha=lambda_/rho/A.shape[0], warm_start=True, fit_intercept=False, positive=True)
                z1_lasso = Lasso(alpha=lambda_/rho/z1.shape[0], fit_intercept=False, warm_start=True, positive=False)
                z2_ridge = ElasticNet(alpha=2*theta/rho/z2.shape[0], l1_ratio=0, warm_start=True, positive=False, fit_intercept=False)
            print primal_diff, dual_diff, primal_diff + dual_diff, LL(w_estimate, Dw_w0_estimate, diff_estimates)

    #print D.todense()
    #print (Y-Dw_w0_estimate).reshape((n_masses, n_spectra), order='F')
    print LL(w_estimate, Dw_w0_estimate, diff_estimates)
    # total abundance of each molecule, summed over the image grid
    print w_estimate.reshape((n_molecules, self.nrows, self.ncols), order='F').sum(axis=(1,2))
    #print w0_estimate.reshape((self.nrows, self.ncols), order='F')
    print self.sum_formulae
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1) # set the final alpha by using LassoCV Lambdas = np.logspace(-5, 5, 200) lasso_cv = LassoCV(alphas=Lambdas, normalize=True, cv=10) lasso_cv.fit(X_train, y_train) print('Alpha is:' + str(round(lasso_cv.alpha_, 4))) lasso = Lasso(alpha=lasso_cv.alpha_) # predict lasso.fit(X_train, y_train) y_predict = lasso.predict(X) y_test_predict = lasso.predict(X_test) # model evaluation (MSE,MAE,std_error) mse_predict = round(mean_squared_error(y_test, y_test_predict), 4) mae_predict = round(mean_absolute_error(y_test, y_test_predict), 4) std_error = round(Standard_error(y_test_predict), 4) coef = [] for i in range(8): coef.append((factors[i], round(lasso.coef_[i], 4))) print('Intercept is:' + str(round(lasso.intercept_, 4))) print('Estimated coefficients are:' + str(coef)) print('Std Error is:' + str(std_error)) print('MSE is:' + str(mse_predict))
# Standardise the features, build a combined PCA + univariate-selection
# feature space, fit a Lasso, and write the test predictions to CSV.
scaler = preprocessing.StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

## PCA and Feature Selection
# Earlier PCA-only variant, kept for reference:
'''pca = PCA(n_components=100) pca.fit(X_train_scaled) #print(pca.explained_variance_ratio_) X_train_reduced = pca.transform(X_train_scaled) X_test_reduced = pca.transform(X_test_scaled) '''
pca = PCA(n_components=800)
selection = SelectKBest(k=850)
# union of 800 principal components and the 850 best original features
combined_features = FeatureUnion([("pca", pca), ("univ_select", selection)])
combined_features.fit(X_train_scaled, train_labels.ravel())
#print(pca.explained_variance_ratio_)
X_train_reduced = combined_features.transform(X_train_scaled)
X_test_reduced = combined_features.transform(X_test_scaled)

## Train final Classifiers
#clf = Ridge(alpha=.5)
clf = Lasso(alpha=.03)
clf.fit(X_train_reduced, Y_train_raw)
Y_predicted = clf.predict(X_test_reduced)

## Save results to csv
np.savetxt('prediction.csv', Y_predicted, fmt='%.5f',delimiter=',')
# (continuation of a train_test_split(...) call begun before this excerpt)
                                                   Y, test_size=0.33, random_state=100)

##################################### - Losso Regression - ##########################################
### Run a LASSO regressor over a range of alpha values and observe how the
### R-Squared, train_rmse and test_rmse change with alpha.
train_rmse = []
test_rmse = []
R_sqrd = []
alphas = np.arange(0, 500, 1)
for i in alphas:
    LRM = Lasso(alpha=i, normalize=True, max_iter=500)
    LRM.fit(X_train, y_train)
    R_sqrd.append(LRM.score(X_train, y_train))
    train_rmse.append(np.sqrt(np.mean((LRM.predict(X_train) - y_train)**2)))
    test_rmse.append(np.sqrt(np.mean((LRM.predict(X_test) - y_test)**2)))

# Plotting Alpha vs Train and Test RMSE.
plt.scatter(x=alphas, y=R_sqrd)
plt.xlabel("alpha")
plt.ylabel("R_Squared")
plt.scatter(x=alphas, y=train_rmse)
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.scatter(x=alphas, y=test_rmse)
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.legend(("alpha Vs R_Squared", "alpha Vs train_rmse", "alpha Vs test_rmse"))

##Another Way of finding alpha value by using GV but above is best than this
def LassoLambda(alpha, trainSet, validationSet):
    """Fit a normalized Lasso at the given alpha on trainSet (columns 'x'
    through 'x_10' against 'y') and return the L2 norm of the validation
    residuals."""
    model = Lasso(alpha=alpha, normalize=True, max_iter=1e5)
    model.fit(trainSet.loc[:, 'x':'x_10'], trainSet.loc[:, 'y'])
    residuals = model.predict(validationSet.loc[:, 'x':'x_10'].values) - validationSet.loc[:, 'y']
    return np.linalg.norm(residuals, ord=2)
# Grid-search the Lasso penalty with 5-fold cross-validation, then evaluate a
# small-alpha Lasso on the test split.
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

lasso=Lasso()
parameters={'alpha':[1e-15,1e-10,1e-8,1e-3,1e-2,1,5,10,20,30,35,40,45,50,55,100]}
lasso_regression=GridSearchCV(lasso,parameters,scoring='neg_mean_squared_error',cv=5)
lasso_regression.fit(X_train, y_train)
print(lasso_regression.best_params_)
print(lasso_regression.best_score_)

# In[88]:

lassoreg = Lasso(1e-08, normalize=True)
lassoreg.fit(X_train, y_train)
lasso_pred = lassoreg.predict(X_test)
print("R-Square Value",r2_score(y_test,lasso_pred))
print("\n")
# BUG FIX: the error metrics below were computed from `y_pred` - the
# predictions of an earlier, different model - while being reported alongside
# the Lasso results; use `lasso_pred` so the printed metrics describe this
# model.
print ("mean_absolute_error :",metrics.mean_absolute_error(y_test, lasso_pred))
print("\n")
print ("mean_squared_error : ",metrics.mean_squared_error(y_test, lasso_pred))
print("\n")
print ("root_mean_squared_error : ",np.sqrt(metrics.mean_squared_error(y_test, lasso_pred)))

# In[89]:

sns.distplot(y_test-lasso_pred)
# Reorder the data by a precomputed index, fit a Lasso and a gradient-boosting
# regressor, and report the RMSE of their averaged predictions.
# (Python 2 print statement below.)
X = X[index]
y = y[index]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.05)

#train the model
print("train lr model...")
linear = Lasso(normalize=True, alpha=0.1)
linear = linear.fit(x_train, y_train)
print("train lr model...end ")
gdbr = GradientBoostingRegressor(n_estimators=100)
gdbr = gdbr.fit(x_train, y_train)
print("train gb model... over")

#test the model
y_pred_lr = linear.predict(x_test)
y_pred_gb = gdbr.predict(x_test)
# simple 50/50 blend of the two models
y_pred = (y_pred_gb + y_pred_lr) / 2.0
# RMSE of the blended prediction
loss = np.mean(np.square((y_test - y_pred)))
loss = np.power(loss, 0.5)
# loss = mean_squared_error(y_test,y_pred)
print loss
# sys.exit()

result = []
with open("/home/zsc/下载/data_new (3)/CIKM2017_testA/testA.txt", 'r') as f:
# (continuation of a column-selection expression begun before this excerpt)
[target]).astype(np.float32)
x, y = to_xy(df, 'weight')
y = np.reshape(y, y.shape[0])  # flatten the target to 1-D
testx = finaltest.as_matrix().astype(np.float32)

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# Compare three regressors on the same split by test RMSE.

# feature selection: lasso
from sklearn import metrics
feature_sele = Lasso(random_state=0, alpha=0.1)
feature_sele.fit(x_train, y_train)
pred_lasso = feature_sele.predict(x_test)
score_lasso = np.sqrt(metrics.mean_squared_error(pred_lasso, y_test))

# svm
from sklearn.svm import SVR
svr = SVR(kernel='rbf')
svr.fit(x_train, y_train)
pred_svr = svr.predict(x_test)
score_svr = np.sqrt(metrics.mean_squared_error(pred_svr, y_test))

# knn
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(weights='distance')
knn.fit(x_train, y_train)
pred_knn = knn.predict(x_test)
score_knn = np.sqrt(metrics.mean_squared_error(pred_knn, y_test))
def random_subset_sampling(num_pts, num_repeats, X, y):
    """Repeatedly fit LassoCV on random subsets of num_pts rows and histogram
    which features receive non-zero coefficients, both at the CV-optimal
    alpha ('Opt.') and averaged over AICc-plausible alphas ('Avg'); the
    selection frequencies are written to CSV and plotted."""
    indices = list(range(0, X.shape[0]))
    model = LassoCV(cv=5, verbose=False, eps=1e-5)
    feat_hist = np.zeros(X.shape[1])      # selection counts at the CV-optimal alpha
    feat_hist_avg = np.zeros(X.shape[1])  # counts over AICc-weighted alphas
    aicc_cut = 0.001  # Akaike-weight threshold for an alpha to "contribute"

    # Standardize data (in place - X is modified for the caller)
    y = (y - np.mean(y)) / np.std(y)
    # NOTE(review): the loop starts at 1, leaving column 0 unstandardized -
    # presumably a constant/bias column; confirm.
    for i in range(1, X.shape[1]):
        X[:, i] = (X[:, i] - np.mean(X[:, i])) / np.std(X[:, i])

    tot_num_avg = 0
    for i in range(num_repeats):
        print("Sample {} of {}".format(i, num_repeats))
        # draw a random subset of num_pts rows
        shuffle(indices)
        indx = np.array(indices[:num_pts])
        X_sel = X[indx, :]
        y_sel = y[indx]
        model.fit(X_sel, y_sel)
        nonzero = np.nonzero(model.coef_)[0]
        print("Num coeff. {}".format(len(nonzero)))
        feat_hist[nonzero] += 1

        # Re-fit a plain Lasso at every alpha along the CV path and score
        # each fit with AICc.
        aicc_vals = np.zeros(len(model.alphas_))
        nonzeros = []
        # print(model.alphas_)
        # print(model.alpha_)
        # exit()
        for j in range(len(model.alphas_)):
            m = Lasso(alpha=model.alphas_[j])
            m.fit(X_sel, y_sel)
            coeff = m.coef_
            nonzero = np.nonzero(coeff)[0]
            pred = m.predict(X_sel)
            rmse = np.sqrt(np.mean((y_sel - pred)**2))
            rss = np.sum((y_sel - pred)**2)
            if rmse**2 < 1e-12:
                # guard against a numerically perfect fit
                rmse = 1e-6
            #print("RMSE: {:e}".format(rmse))
            numCoeff = len(nonzero)
            nonzeros.append(nonzero)
            if numCoeff >= X_sel.shape[0] - 1:
                # more parameters than samples: AICc undefined, penalise hard
                aicc_vals[j] = 1e100
            else:
                aicc_vals[j] = aicc(numCoeff, X_sel.shape[0], rss)

        # Akaike weights relative to the best alpha on this subset.
        aicc_vals -= np.min(aicc_vals)
        w = np.exp(-aicc_vals)
        #print(w)
        contribute = np.nonzero(w > aicc_cut)[0]
        print(contribute)
        for idx in contribute:
            feat_hist_avg[nonzeros[idx]] += 1.0
            tot_num_avg += 1

    # Convert counts to frequencies.
    feat_hist_avg /= tot_num_avg
    feat_hist /= num_repeats
    fname = 'random_partion{}.csv'.format(num_pts)
    np.savetxt(fname, np.vstack((feat_hist_avg, feat_hist)).T, delimiter=',')
    print("Selection data written to {}".format(fname))

    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(feat_hist_avg, label='Avg')
    ax.plot(feat_hist, label='Opt.')
    ax.legend()
    print("Std avg: {}".format(np.std(feat_hist_avg)))
    print("Std opt: {}".format(np.std(feat_hist)))
    plt.show()
# (continuation of a cross-validated ridge construction begun before this
# excerpt - presumably RidgeCV; confirm against the preceding code)
                                     scoring='mean_absolute_error', cv=10)
ozone_ridgecv_reg = ozone_ridgecv_reg.fit(ozone_train.drop('ozone', axis=1),
                                          ozone_train['ozone'])

## Compare regularization models: coefficients of each fitted model plus the
## alpha chosen by cross-validation.
print("Linear Coef: " + str(ozone_ln_reg.coef_) + "\nRidge Coef: " +
      str(ozone_ridge_reg.coef_) + "\nLasso Coef: " + str(ozone_lasso_reg.coef_) +
      "\nCV Coef: " + str(ozone_ridgecv_reg.coef_) + "\nCV alpha: " +
      str(ozone_ridgecv_reg.alpha_))

# Predict using models and evaluate
ozone_ln_pred = ozone_ln_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridge_pred = ozone_ridge_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_lasso_pred = ozone_lasso_reg.predict(ozone_test.drop('ozone', axis=1))
ozone_ridgecv_pred = ozone_ridgecv_reg.predict(ozone_test.drop('ozone', axis=1))

## Calculate MAE, RMSE, and R-squared for all models
ozone_ln_mae = metrics.mean_absolute_error(ozone_test['ozone'], ozone_ln_pred)
ozone_ln_rmse = sqrt(
    metrics.mean_squared_error(ozone_test['ozone'], ozone_ln_pred))
ozone_ln_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ln_pred)
ozone_ridge_mae = metrics.mean_absolute_error(ozone_test['ozone'],
                                              ozone_ridge_pred)
ozone_ridge_rmse = sqrt(
    metrics.mean_squared_error(ozone_test['ozone'], ozone_ridge_pred))
ozone_ridge_r2 = metrics.r2_score(ozone_test['ozone'], ozone_ridge_pred)
# (continuation of a print('MSE train: ...') call begun before this excerpt)
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

# # Using regularized methods for regression
# Fit a Lasso on the same split and report the same MSE / R^2 metrics.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)
print(lasso.coef_)
print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' % (
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred)))

# Ridge regression:
def lassoCVPath(X, y):
    """Plot the LASSO regularization path with three model-selection criteria.

    Fits LassoCV, then for every alpha on its path re-fits a plain Lasso to
    compute AICc and BIC.  Figure 1 shows CV error (left axis, scaled by 1000,
    labelled meV/atom) and AICc/BIC (right axis) against log10(alpha), with a
    dashed vertical line at each criterion's minimum.  Figure 2 scatters the
    nonzero-feature indices against log10(alpha).

    Relies on module-level `aicc`, `bic`, `np` and `plt`.  Blocks on plt.show().
    """
    model = LassoCV(cv=10, verbose=False, eps=1e-5)
    model.fit(X, y)
    fig_path = plt.figure()
    ax_path = fig_path.add_subplot(1, 1, 1)
    # Mean CV RMSE per alpha, averaged over the folds, scaled by 1000.
    pred_error = np.mean(np.sqrt(model.mse_path_) * 1000.0, axis=1)
    min_indx = np.argmin(pred_error)
    x_ax = np.log10(model.alphas_)
    ax_path.plot(x_ax, pred_error, color='#7c6868', label="CV")
    ax_path.axvline(np.log10(model.alphas_[min_indx]), ls='--', color='#7c6868')
    # Calculate AIC
    aicc_vals = np.zeros(len(model.alphas_))
    bic_vals = np.zeros_like(aicc_vals)
    nonzeros = []  # nonzero-feature index arrays, one per alpha (for figure 2)
    for i in range(len(model.alphas_)):
        m = Lasso(alpha=model.alphas_[i])
        m.fit(X, y)
        coeff = m.coef_
        nonzero = np.nonzero(coeff)[0]
        pred = m.predict(X)
        rss = np.sum((pred - y)**2)
        if rss < 1e-12:
            rss = 1e-12  # floor so a (near-)perfect fit keeps aicc/bic finite
        numCoeff = len(nonzero)
        nonzeros.append(nonzero)
        print(numCoeff, np.sqrt(rss / len(pred)), model.alphas_[i])
        aicc_vals[i] = aicc(numCoeff, X.shape[0], rss)
        bic_vals[i] = bic(numCoeff, X.shape[0], rss)
    # Second y-axis for the information criteria.
    ax_path2 = ax_path.twinx()
    ax_path2.plot(x_ax, aicc_vals, color='#b63119', label="AICc")
    min_indx = np.argmin(aicc_vals)
    ax_path.axvline(x_ax[min_indx], ls='--', color='#b63119')
    ax_path2.plot(x_ax, bic_vals, color='#cb9f52', label="BIC")
    min_indx = np.argmin(bic_vals)
    ax_path.axvline(x_ax[min_indx], ls='--', color='#cb9f52')
    ax_path.legend(frameon=False)
    ax_path.set_ylabel("CV (meV/atom)")
    ax_path.set_xlabel("log \$\\lambda\$")
    ax_path2.legend(frameon=False)
    ax_path2.set_ylabel("AICc/BIC")
    #ax_path2.set_ylim([-30000, -16500])
    # Figure 2: which features survive at each alpha.
    fig2 = plt.figure()
    ax2 = fig2.add_subplot(1, 1, 1)
    for i, non in enumerate(nonzeros):
        x = [np.log10(model.alphas_[i]) for _ in range(len(non))]
        ax2.plot(x, non, ls='none', marker='o', mfc='none', color='#7c6868',
                 markersize=1.5)
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.set_xlabel("log \$\\lambda\$")
    ax2.set_ylabel("Feature no.")
    plt.show()
def run(self):
    """Select the l1 penalty (tau) by internal cross-validation, then refit.

    For each candidate tau (a scaling of the data-dependent bound TAU_MAX),
    a Lasso is fitted on each internal stratified fold; the surviving features
    feed an OLS refit (debiasing step) whose signed predictions are scored
    against the fold labels.  The tau with the highest mean accuracy is used
    for a final fit on the full training set, and signed train/test
    predictions are produced the same way.

    Returns:
        dict with keys 'selected_list' (indices of nonzero Lasso coefficients),
        'prediction_ts_list' / 'prediction_tr_list' (signed predictions), and
        'labels_ts' (the held-out labels, passed through).
    """
    # Model selection phase
    internal_k = self._params['internal_k']
    # Perform k splits once
    skf = StratifiedKFold(self._Ytr, n_folds=internal_k)
    # Mean accuracy for each candidate tau.
    # BUGFIX: was np.empty -- when the early-stopping `break` below fires,
    # the tail of the array is never written, and np.argmax over that
    # uninitialized memory could select a tau that was never evaluated.
    # Zeros are safe: accuracies are in [0, 1].
    acc_list = np.zeros((len(self._params['tau_range']),))
    TAU_MAX = self.get_l1_bound()
    # print("TAU_MAX = {}".format(TAU_MAX))
    for i, tau_scaling in enumerate(self._params['tau_range']):
        tau = TAU_MAX * tau_scaling
        # print("{}-th value of tau ({})".format(i+1, tau))
        acc = 0
        # number of solutions which consisted of only zeros
        # (early stopping for too big tau)
        N_allzeros = 0
        for idx_tr, idx_ts in skf:
            Xk_tr = self._Xtr[idx_tr, :]
            Xk_ts = self._Xtr[idx_ts, :]
            Yk_tr = self._Ytr[idx_tr]
            Yk_ts = self._Ytr[idx_ts]
            clf = Lasso(alpha=tau)
            clf.fit(Xk_tr, Yk_tr)  # fit the model
            # extract only nonzero coefficients
            selected_features = np.argwhere(clf.coef_).ravel()
            # print("Selected {} features".format(len(selected_features)))
            if len(selected_features) == 0:
                # If no features are selected, just assign all samples to
                # the most common class (in the training set); the +0.1
                # breaks a tie in favor of the +1 class.
                N_allzeros += 1
                Yk_lr = np.ones((len(Yk_ts),)) * np.sign(Yk_tr.sum() + 0.1)
            else:
                # Else, run OLS and get weights for coefficients NOT
                # affected by shrinking
                Xk_tr2 = Xk_tr[:, selected_features]
                Xk_ts2 = Xk_ts[:, selected_features]
                clf = LinearRegression(normalize=False)
                clf.fit(Xk_tr2, Yk_tr)  # fit the model
                Yk_lr = clf.predict(Xk_ts2)  # predict test data
                Yk_lr = np.sign(Yk_lr)  # take the sign
            acc += accuracy_score(Yk_ts, Yk_lr)
        acc_list[i] = acc / internal_k
        if N_allzeros == internal_k:
            # All k-fold splits returned empty solutions, stop here as
            # bigger values of tau would return empty solutions as well
            print("The {}-th value of tau ({}) returned only empty "
                  "solutions".format(i + 1, tau))
            break
    # Final train with the best choice for tau
    best_tau_idx = np.argmax(acc_list)
    best_tau = self._params['tau_range'][best_tau_idx] * TAU_MAX
    clf = Lasso(alpha=best_tau)
    clf.fit(self._Xtr, self._Ytr)  # fit the model
    # extract only nonzero coefficients
    selected_features = np.argwhere(clf.coef_).ravel()
    if len(selected_features) == 0:
        # Degenerate final model: fall back to the majority class everywhere.
        print("WARNING: the allegedly best solution (tau = {}) was "
              " empty".format(best_tau))
        sign = np.sign(np.sum(self._Ytr) + 0.1)
        Y_lr = np.ones((len(self._Yts)),) * sign
        Y_lr_tr = np.ones((len(self._Ytr)),) * sign
    else:
        # OLS refit on the selected features (debiasing), then sign-threshold.
        X_tr2 = self._Xtr[:, selected_features]
        X_ts2 = self._Xts[:, selected_features]
        clf = LinearRegression()
        clf.fit(X_tr2, self._Ytr)  # fit the model
        Y_lr = clf.predict(X_ts2)  # predict test data
        Y_lr = np.sign(Y_lr)  # take the sign
        Y_lr_tr = clf.predict(X_tr2)  # predict training data
        Y_lr_tr = np.sign(Y_lr_tr)  # take the sign
    result = {}
    result['selected_list'] = selected_features
    result['prediction_ts_list'] = Y_lr
    result['prediction_tr_list'] = Y_lr_tr
    result['labels_ts'] = self._Yts
    return result
pred_test_LGB = myLGB.predict(X_test) # Stacking stackedset = pd.DataFrame({'A': []}) stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_l2)], axis=1) stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_l1)], axis=1) stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_GBR)], axis=1) stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_ENet)], axis=1) stackedset = pd.concat([stackedset, pd.DataFrame(pred_test_LGB)], axis=1) # prod = (pred_test_l2*pred_test_l1*pred_test_GBR*pred_test_ENet*pred_test_LGB) ** (1.0/5.0) # stackedset = pd.concat([stackedset,pd.DataFrame(prod)],axis=1) Xstack = np.array(stackedset) Xstack = np.delete(Xstack, 0, axis=1) l1_staked = Lasso(alpha=0.0001, fit_intercept=True) l1_staked.fit(Xstack, y_test) pred_test_stack = l1_staked.predict(Xstack) models.append([l2Regr, l1Regr, myGBR, ENet, myLGB, l1_staked]) # 模型预测 X_score = np.array(df_score) X_score = np.delete(X_score, 0, 1) M = X_score.shape[0] scores_fin = 1 + np.zeros(M) for m in models: ger = m[0] las = m[1] gbr = m[2] Enet = m[3] lgb = m[4] las2 = m[5] ger_predict = ger.predict(X_score)