def bagging(self, trains, tests, train_y, model_name=None):
    blend_train = trains.T
    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5,
                   fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, train_y)
    y_test_predict = bclf.predict(tests.T)
    train_predict = bclf.predict(trains.T)
    return train_predict, y_test_predict
def lassoCV_regression(data, target, alphas):
    clf = LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    data_transform = sfm.transform(data)  # initialise in case the loop below never runs
    n_features = data_transform.shape[1]
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]
    rmses = []
    kf = KFold(len(target), 10, True, None)
    for train_index, test_index in kf:
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
    x0 = np.arange(1, 11)
    plt.figure()
    plt.plot(x0, rmses, label='LassoCV')
    plt.legend()
    plt.show()
    return rmses
def lasso_cv(x, y, x_pred=None, max_deg=3, cv=10, max_iter=1e3, return_model=False):
    """LASSO polynomial fit with cross-validation.

    Regularized polynomial regression (by penalized least-squares) from a
    range of degrees up to n = max_deg. The LASSO regression minimises MSE
    and penalizes the size of the parameter vector using the L1-norm, which
    leads to fewer coefficients in the fitted model.

    - The 'alpha' parameter (amount of penalization) is selected by k-fold CV.
    - Predicts fitted model on given values 'x_pred' (default uses 'x').
    - Supports NaNs.
    """
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x_, y_ = x[ind], y[ind]
    X_ = dmatrix('C(x_, Poly)')
    if x_pred is None:
        X = dmatrix('C(x, Poly)')       # predict on original values
    else:
        X = dmatrix('C(x_pred, Poly)')  # predict on given values
    lasso = LassoCV(cv=cv, copy_X=True, normalize=True, max_iter=max_iter)
    lasso = lasso.fit(X_[:, 1:max_deg+1], y_)
    y_pred = lasso.predict(X[:, 1:max_deg+1])
    if return_model:
        y_pred = [y_pred, lasso]
    return y_pred
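# A minimal usage sketch of lasso_cv above (hypothetical data; assumes numpy,
# patsy.dmatrix and sklearn.linear_model.LassoCV are imported as in the function):
import numpy as np
x_demo = np.linspace(0, 10, 50)
y_demo = 2.0 + 0.5 * x_demo - 0.1 * x_demo ** 2 + np.random.randn(50) * 0.2
y_demo[10:13] = np.nan                      # NaN pairs are dropped internally
y_fit, model = lasso_cv(x_demo, y_demo, max_deg=3, cv=5, return_model=True)
print(model.alpha_)                         # penalty chosen by cross-validation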
def remove_foreground_glm(x, y, spatial_mask=None, spectral_mask=None,
                          alphas=None, l1_ratio=1.):
    """Model y as a (generalized) linear combination of the channels of x.

    Args:
        x: data cube; axis 0 indexes the channels, the remaining axes are
            spatial and are flattened internally.
        y: target map whose flattened size matches one channel of x.
        spatial_mask (optional): boolean map selecting the pixels used in the fit.
        spectral_mask (optional): boolean vector selecting the channels used
            as regressors.
        alphas (optional): penalization value(s) for the CV estimator.
        l1_ratio: 1. selects LassoCV, 0. selects RidgeCV, intermediate values
            select ElasticNetCV.

    Returns:
        (y_model, reg, glm_coeffs): the model prediction reshaped like y, the
        fitted estimator, and the full-length coefficient vector.
    """
    # cast to double and reshape
    x_rs = np.float64(x.reshape((x.shape[0], -1))).T
    y_rs = np.float64(y.flatten())
    if spatial_mask is None:
        spatial_mask_rs = np.ones_like(y_rs, dtype=bool)
    else:
        spatial_mask_rs = spatial_mask.flatten()
    if spectral_mask is None:
        spectral_mask = np.ones(x_rs.shape[1], dtype=bool)
    if alphas is not None:
        alphas = np.atleast_1d(alphas)
    # fit GLM
    if l1_ratio == 1.:
        reg = LassoCV(positive=True, alphas=alphas, n_jobs=-1, max_iter=5000)
    elif l1_ratio == 0.:
        reg = RidgeCV(alphas=alphas)
    else:
        reg = ElasticNetCV(positive=True, alphas=alphas, n_jobs=-1, l1_ratio=l1_ratio)
    reg.fit(x_rs[spatial_mask_rs][:, spectral_mask], y_rs[spatial_mask_rs])
    y_model = reg.predict(x_rs[:, spectral_mask]).reshape(y.shape)
    glm_coeffs = np.zeros(x_rs.shape[1], dtype=np.float32)
    glm_coeffs[spectral_mask] += reg.coef_
    return y_model, reg, glm_coeffs
def predict(self, trains_x, train_y, tests_x, parameters, times=10, isFile=True, foldername="blend-dir"):
    """
    Ensemble many features and regressions.

    :params trains_x: dictionary of training feature sets
    :params train_y: training target vector
    """
    # parameter_get
    test_data_sample = tests_x.values()[0]
    if not os.path.exists(foldername):
        os.makedirs(foldername)
    skf = None
    kfold_file = foldername + "/kfold_index.pkl"
    if os.path.exists(kfold_file):
        skf = pickle.load(open(kfold_file, "r"))
    else:
        skf = KFold(n=len(train_y), n_folds=times, shuffle=True)
        pickle.dump(skf, open(kfold_file, "w"))
    blend_train = np.zeros((len(train_y), len(parameters)))
    blend_test = np.zeros((len(test_data_sample), len(parameters)))
    for j, parameter in enumerate(parameters):
        train_x = trains_x[parameter['data']]
        test_x = tests_x[parameter['data']]
        blend_test_tmp = np.zeros((len(test_data_sample), times))  # one column per fold
        # file path check
        for i, (train_index, valid_index) in enumerate(skf):
            clf = model_select(parameter['parameter'])
            train = train_x[train_index]
            train_valid_y = train_y[train_index]
            kfold_filepath = "./" + foldername + "/parameter_{}_kfold_{}.pkl".format(j, i)
            if os.path.exists(kfold_filepath):
                blend_train_prediction, blend_test_prediction = pickle.load(open(kfold_filepath, "r"))
            else:
                clf.fit(train, np.log1p(train_valid_y))
                blend_train_prediction = np.expm1(clf.predict(train))
                blend_test_prediction = np.expm1(clf.predict(test_x))
                pickle.dump((blend_train_prediction, blend_test_prediction), open(kfold_filepath, "w"))
            # use the cached or freshly computed predictions in both branches
            blend_train[train_index, j] = blend_train_prediction
            blend_test_tmp[:, i] = blend_test_prediction
        blend_test[:, j] = blend_test_tmp.mean(1)
    # Blending model
    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5,
                   fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, train_y)
    y_test_predict = bclf.predict(blend_test)
    return y_test_predict
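# The `parameters` argument above is a list of dicts, each naming a feature set
# in trains_x/tests_x and the model spec handed to model_select. A sketch under
# assumed names (the keys come from the code above, the concrete values are
# hypothetical):
parameters = [
    {'data': 'tfidf_features',  'parameter': {'model': 'ridge', 'alpha': 1.0}},
    {'data': 'counts_features', 'parameter': {'model': 'rf', 'n_estimators': 200}},
]
# trains_x / tests_x map those names to feature matrices, e.g.:
# trains_x = {'tfidf_features': X1_train, 'counts_features': X2_train}
# y_pred = blender.predict(trains_x, train_y, tests_x, parameters, times=10)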
def fit_Lasso(features_train, labels_train, features_pred):
    model = LassoCV()
    model.fit(features_train, labels_train)
    mse = model.mse_path_
    print "LASSO - Mean square error: ", mse.shape
    # Test the model
    labels_pred = model.predict(features_pred)
    return labels_pred
def make_model_and_predict(train_file, test_file):
    """Given the name of a training csv file and the name of a test csv file,
    constructs a LassoCV model and outputs predictions to a time-stamped csv
    file. If the test_file has SalaryNormalized as an attribute, it will score
    the model and write the result in the file "score<datetime>".
    """
    train = pd.read_csv(train_file)
    valid = pd.read_csv(test_file)
    number_of_word_features = 200
    title_words = count_words_in_column(train, "Title")
    key_count_pairs = [(k, v) for (k, v) in title_words.items()
                       if k not in stopwords.words('english')]
    key_count_pairs.sort(key=lambda (k, v): -v)
    for word, count in key_count_pairs[:number_of_word_features]:
        add_appearance_count_feature(train, word, "Title")
        add_appearance_count_feature(valid, word, "Title")
    group_features = ["LocationNormalized", "Category", "Company", "SourceName"]
    for f in group_features:
        continuize_feature(train, valid, f, "SalaryNormalized")
    feature_columns = train.columns[12:]
    feature = train[feature_columns]
    label = train.SalaryNormalized
    clf = LassoCV()
    clf.fit(feature, label)
    valid_salary_predict = clf.predict(valid[feature_columns])
    valid["SalaryNormalized_Predict"] = valid_salary_predict
    date_string = re.sub("[ :.]", "", str(datetime.datetime.now()))
    predict_filename = 'predict' + date_string + '.csv'
    score_filename = 'score' + date_string + '.txt'
    with open(predict_filename, 'wb') as f:
        valid[["Id", "SalaryNormalized_Predict"]].to_csv(f, index=False, header=False)
    ## Computes average RMS error and writes score to file
    if hasattr(valid, 'SalaryNormalized'):
        score = 0
        for i, _ in enumerate(valid["SalaryNormalized_Predict"]):
            score += (valid.SalaryNormalized[i] - valid.SalaryNormalized_Predict[i]) ** 2
        score = math.sqrt(score / len(valid["SalaryNormalized_Predict"]))
        with open(score_filename, 'wb') as f:
            f.write("Train: " + train_file + "\n")
            f.write("Test: " + test_file + "\n")
            f.write("Score: " + str(score) + "\n")
def get_model_per_cluster(X, Y):
    model_per_cluster = {}
    for c in X.cluster.unique():
        X_cluster = X[X.cluster == c]
        Y_true = Y[Y.cluster == c].ALSFRS_slope
        regr = LassoCV(cv=5)
        regr.fit(X_cluster, Y_true)
        print 'cluster: %d size: %s' % (c, Y_true.shape)
        Y_predict = regr.predict(X_cluster)
        print "\t RMS error (0 is perfect): %.2f" % np.sqrt(np.mean((Y_predict - Y_true) ** 2))
        # R^2 = 1 - SS_res / SS_tot (the original swapped the two names)
        residual_SS = ((Y_predict - Y_true) ** 2).sum()
        total_SS = ((Y_true - Y_true.mean()) ** 2).sum()
        print '\t coefficient of determination R^2 = %.2f ' % (1.0 - residual_SS / total_SS)  # regr.score(X_cluster, Y_true)
        cov = sum((Y_predict - Y_predict.mean()) * (Y_true - Y_true.mean()))
        Y_predict_std = np.sqrt(sum((Y_predict - Y_predict.mean()) ** 2))
        Y_true_std = np.sqrt(sum((Y_true - Y_true.mean()) ** 2))
        print '\t pearson correlation r = %.2f ' % (cov / (Y_predict_std * Y_true_std))  # scipy.stats.pearsonr(Y_predict, Y_true)[0]
        print "3 sample predictions: ", regr.predict(X_cluster)[:3]
        model_per_cluster[c] = {"cluster_train_data_means": X_cluster.mean(), "model": regr}
    return model_per_cluster
def lassoRegularization(X, Y):
    """
    :param X: data consisting of features (excluding the class variable)
    :param Y: column vector consisting of the class variable
    :return: report best RMSE value for lasso regularization
    """
    tuningAlpha = [0.1, 0.01, 0.001]
    lasso = LassoCV(normalize=True, alphas=tuningAlpha, cv=10)
    lasso.fit(X, Y)
    prediction = lasso.predict(X)
    print
    print "LASSO REGULARIZATION"
    print "Best Alpha value for Lasso Regularization : " + str(lasso.alpha_)
    print 'Best RMSE for corresponding Alpha =', np.sqrt(mean_squared_error(Y, prediction))
class LocalRegression:
    """This class implements "local" regression. Given a set of training data
    and a set of unknown data, iterate through each unknown spectrum, find the
    nearest training spectra, and generate a model. Each of these local models
    is optimized using built-in cross validation methods from scikit."""

    def __init__(self, params, n_neighbors=250):
        """Initialize LocalRegression.

        Arguments:
        params = Dict containing the keywords and parameters for the
            regression method to be used.

        Keyword arguments:
        n_neighbors = User-specified number of training spectra to use to
            generate the local regression model for each unknown spectrum.
        """
        # For now, the only option is LASSO. Other methods to be added in the future.
        # params is a dict containing the keywords and parameters for LassoCV.
        self.model = LassoCV(**params)
        self.neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit_predict(self, x_train, y_train, x_predict):
        """Use local regression to predict values for unknown data.

        Arguments:
        x_train = The training data spectra.
        y_train = The values of the quantity being predicted for the training data.
        x_predict = The unknown spectra for which y needs to be predicted.
        """
        self.neighbors.fit(x_train)
        predictions = []
        coeffs = []
        intercepts = []
        for i in range(x_predict.shape[0]):
            print('Predicting spectrum ' + str(i + 1))
            x_temp = np.array(x_predict[i])
            foo, ind = self.neighbors.kneighbors([x_temp])
            x_train_local = np.squeeze(x_train[ind])
            y_train_local = np.squeeze(y_train[ind])
            # Note: this grouped CV split is prepared but never passed to
            # self.model, which relies on LassoCV's own internal CV.
            cv = GroupKFold(n_splits=3)
            cv = cv.split(x_train_local, y_train_local, groups=y_train_local)
            self.model.fit(x_train_local, y_train_local)
            predictions.append(self.model.predict([x_temp])[0])
            coeffs.append(self.model.coef_)
            intercepts.append(self.model.intercept_)
        return predictions, coeffs, intercepts
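# A minimal usage sketch of LocalRegression above (hypothetical arrays; assumes
# numpy and the scikit-learn imports used by the class):
import numpy as np
x_train_demo = np.random.rand(500, 64)   # 500 training spectra, 64 channels
y_train_demo = np.random.rand(500)
x_unknown_demo = np.random.rand(20, 64)  # 20 unknown spectra

local = LocalRegression(params={'max_iter': 10000}, n_neighbors=100)
preds, coeffs, intercepts = local.fit_predict(x_train_demo, y_train_demo, x_unknown_demo)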
def lassocvclassifier(training_samples, eval_samples, vectorizer, do_grid_search=False):
    X_train, Y_train = training_samples
    X_eval, Y_eval = eval_samples
    #clf = SGDClassifier(loss='log', penalty='l2', l1_ratio=0.0, n_iter=30, shuffle=True, verbose=False,
    #                    n_jobs=4, alpha=1e-4, average=True, class_weight=None)
    clf = LassoCV()
    clf.fit(X_train, Y_train)
    #y_train_true, y_train_pred = Y_train, clf.predict(X_train)
    print_top_10_words = True
    scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5, scoring='log_loss')
    print scores, np.mean(scores), np.median(scores)
    print(clf)
    #scores = cross_validation.cross_val_score(clf.best_estimator_, X_train, Y_train, cv=10, scoring='log_loss')
    #print scores, np.mean(scores), np.median(scores)
    y_true, y_pred = Y_eval, clf.predict(X_eval)
    # Note: LassoCV is a regressor and has no predict_proba, so the next line
    # raises AttributeError as written; a classifier (e.g. the commented-out
    # SGDClassifier) is needed for probabilities.
    y_prob = clf.predict_proba(X_eval)
test_data["casual_log"], feature_engg_linreg_model.predict(test_data.drop(target, axis=1)) ) # Not much difference? > Doesn't look like we are overfitting! # But how to perform shrinkage/penalized regression in general? from sklearn.linear_model import LassoCV feature_engg_lassocv_model = LassoCV(max_iter=50, cv=3, n_jobs=-1, random_state=42) feature_engg_lassocv_model.fit(train_data.drop(target, axis=1), train_data["casual_log"]) feature_engg_lassocv_mse_train = metrics.mean_squared_error( train_data["casual_log"], feature_engg_lassocv_model.predict(train_data.drop(target, axis=1)) ) feature_engg_lassocv_mse_test = metrics.mean_squared_error( test_data["casual_log"], feature_engg_lassocv_model.predict(test_data.drop(target, axis=1)) ) # Check the performance on test set print feature_engg_linreg_mse_test print feature_engg_lassocv_mse_test # Penalization decreases performance? # Compare coefficients with non penalized model print feature_engg_linreg_model.coef_[1:10] print feature_engg_lassocv_model.coef_[1:10]
print("Try again for more precision with alphas centered around " + str(alpha)) lasso = LassoCV(alphas=[ alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4 ], max_iter=50000, cv=10) lasso.fit(X_train, y_train) alpha = lasso.alpha_ print("Best alpha :", alpha) print("Lasso RMSE on Training set :", rmse_cv_train(lasso).mean()) print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean()) y_train_las = lasso.predict(X_train) y_test_las = lasso.predict(X_test) # Plot residuals plt.scatter(y_train_las, y_train_las - y_train, c="blue", marker="s", label="Training data") plt.scatter(y_test_las, y_test_las - y_test, c="lightgreen", marker="s", label="Validation data") plt.title("Linear regression with Lasso regularization") plt.xlabel("Predicted values")
print(y_test)

# train
alpha = [0.01, 0.1, 1, 10, 100, 1000]
lasso = LassoCV(alphas=alpha, cv=5)
lasso.fit(x_train, y_train)

# alpha
alpha = lasso.alpha_
print('best alpha is : ' + str(alpha))

# test
y_train_pred = lasso.predict(x_train)
y_test_pred = lasso.predict(x_test)

# calculate
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
print('RMSE of training dataset is : ' + str(rmse_train))
print('RMSE of test dataset is : ' + str(rmse_test))
# print(y_test_pred)

# plot
# y_test_pred_mean = y_test_pred.mean()
# y_test_pred_std = y_test_pred.std()
# What does our prediction error look like?
from yellowbrick.regressor import PredictionError

prederr = PredictionError(lasso)
prederr.fit(Xtrain, ytrain)
prederr.score(Xtest, ytest)
g = prederr.poof()

# Next, we pull out our fitted values (yhat) and actuals (ytest) to see how
# they compare. We also calculate our residuals by subtracting our fitted
# values from the actuals.
import matplotlib.pyplot as plt

lasso.fit(Xtrain, ytrain)
yhat = lasso.predict(Xtest)
error = ytest - yhat
data = pd.DataFrame({'t': range(1, len(yhat) + 1), 'ytest': ytest, 'yhat': yhat, 'error': error})
plt.plot('t', 'ytest', data=data, color='blue', linewidth=1, label='actual')
plt.plot('t', 'yhat', data=data, color='orange', marker='o', linestyle="None", label='predicted', alpha=0.5)
plt.plot('t', 'error', data=data, color='gray')
plt.legend()
plt.show()

# Pickle model
from sklearn.externals import joblib
#print('\nTarget on train data', predict_train)
#
## Accuracy Score on train dataset
#accuracy_train = accuracy_score(train_y, predict_train)
#print('accuracy_score on train dataset : ', accuracy_train)
#
## predict the target on the test dataset
#predict_test = model.predict(test_x)
#print('Target on test data', predict_test)
#
## Accuracy Score on test dataset
#accuracy_test = accuracy_score(test_y, predict_test)
#print('accuracy_score on test dataset : ', accuracy_test)

###########################################################################
#############           LINEAR REGRESSION             ####################
###########################################################################

reg = LinearRegression()
reg.fit(X_train, Y_train)
reg.score(X_test, Y_test)

from sklearn import ensemble
clf = ensemble.GradientBoostingRegressor(n_estimators=400, max_depth=5, min_samples_split=2,
                                         learning_rate=0.1, loss='ls')
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

y_pred = reg.predict(X_test)
rmse_cv(model_lasso).mean()

coef = pd.Series(model_lasso.coef_, index=X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +
      str(sum(coef == 0)) + " variables")
imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)])
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")

# let's look at the residuals as well:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)
preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x="preds", y="residuals", kind="scatter")

preds = np.expm1(model_lasso.predict(X_test))

# Adding an xgboost model
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y)
dtest = xgb.DMatrix(X_test)
params = {"max_depth": 2, "eta": 0.1}
model = xgb.cv(params, dtrain, num_boost_round=500, early_stopping_rounds=100)
model.loc[30:, ["test-rmse-mean", "train-rmse-mean"]].plot()
plt.ylim(ymin, ymax)

## get the coefficient for each column
coef = pd.Series(model_lasso.coef_, index=X_train1.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
coef.head(20).plot(kind='bar')
plt.title('Feature Importance in Lasso Model')
plt.tight_layout()

imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)])
plt.figure(figsize=(8, 10))
imp_coef.plot(kind="barh")
plt.title("Coefficients in Lasso Model")
plt.tight_layout()

preds_lasso = pd.DataFrame({"preds": model_lasso.predict(X_test1), "true": y_test1})
preds_lasso["residuals"] = preds_lasso["true"] - preds_lasso["preds"]
preds_lasso["residuals"].abs().mean()  # 0.0807606
preds_lasso.plot(x="preds", y="residuals", kind="scatter")

plt.figure(figsize=(10, 5))
plt.scatter(y_test1, preds_lasso["preds"], s=20)
plt.title('Predicted vs. Actual')
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test1), max(y_test1)], [min(y_test1), max(y_test1)])
plt.tight_layout()

##### xgboost model ##########
import xgboost as xgb
def lasso_model(data, y):
    X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.3, random_state=0)
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    def rmse_cv_train(model):
        rmse = np.sqrt(-cross_val_score(model, X_train, y_train, scoring=scorer, cv=10))
        return (rmse)

    def rmse_cv_test(model):
        rmse = np.sqrt(-cross_val_score(model, X_test, y_test, scoring=scorer, cv=10))
        return (rmse)

    lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
                            0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
                    max_iter=50000, cv=10)
    lasso.fit(X_train, y_train)
    alpha = lasso.alpha_
    print("Best alpha :", alpha)

    print("Try again for more precision with alphas centered around " + str(alpha))
    lasso = LassoCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75,
                            alpha * .8, alpha * .85, alpha * .9, alpha * .95,
                            alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                            alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],
                    max_iter=50000, cv=10)
    lasso.fit(X_train, y_train)
    alpha = lasso.alpha_
    print("Best alpha :", alpha)

    print("Lasso RMSE on Training set :", rmse_cv_train(lasso).mean())
    print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean())
    y_train_las = lasso.predict(X_train)
    y_test_las = lasso.predict(X_test)

    # Plot residuals
    plt.scatter(y_train_las, y_train_las - y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_las, y_test_las - y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear regression with Lasso regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_las, y_train, c="blue", marker="s", label="Training data")
    plt.scatter(y_test_las, y_test, c="lightgreen", marker="s", label="Validation data")
    plt.title("Linear regression with Lasso regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    # Plot important coefficients
    coefs = pd.Series(lasso.coef_, index=X_train.columns)
    print("Lasso picked " + str(sum(coefs != 0)) + " features and eliminated the other " +
          str(sum(coefs == 0)) + " features")
    imp_coefs = pd.concat([coefs.sort_values().head(10), coefs.sort_values().tail(10)])
    imp_coefs.plot(kind="barh")
    plt.title("Coefficients in the Lasso Model")
    plt.show()
    return lasso
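# A sketch of calling lasso_model above (hypothetical names: `features` is a
# fully numeric DataFrame, `target_log` a log1p-transformed price series, which
# matches the 10.5-13.5 plot ranges hard-coded in the function):
fitted_lasso = lasso_model(features, target_log)
print(fitted_lasso.alpha_)  # final CV-selected penalty after the two-pass search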
# Train the model.
xgbReg.fit(x_train, y_train)
# Compute the error.
print("xgb score:")
get_score(prediction=xgbReg.predict(x_train), lables=y_train)
y_pred_xgb = xgbReg.predict(x_test)

# In[72]:

model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(x_train, y_train)
#model_lasso = Lasso(alpha=0.00099, max_iter=50000)
#model_lasso.fit(x_train_st, y_train)
get_score(prediction=model_lasso.predict(x_train), lables=y_train)
y_pred_lasso = model_lasso.predict(x_test)

# In[78]:

model_elastic = ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                             l1_ratio=[.01, .1, .5, .9, .99],
                             max_iter=5000).fit(x_train, y_train)
get_score(prediction=model_elastic.predict(x_train), lables=y_train)
y_pred_elastic = model_elastic.predict(x_test)

y_pred = (y_pred_xgb * 0.3 + y_pred_lasso * 0.4 + y_pred_elastic * 0.3)  # submission-31-12-4
# y_pred = (y_pred_xgb*0.3 + y_pred_lasso*0.45 + y_pred_elastic*0.25)  # submission-31-12-5
# y_pred = (y_pred_xgb + y_pred_elastic + y_pred_lasso)/3  # submission-31-12-6
y_pred = np.exp(y_pred)
print coef_path_forest_cv.get_params
print coef_path_forest_cv.feature_importances_
forest_prediction = coef_path_forest_cv.predict(X)
forest_score = coef_path_forest_cv.score(X, y)
print "Forest_score:%.3g" % forest_score
forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, y, n_jobs=2, cv=5)
print forest_cv_score

print "########LASSO######"
coef_path_lasso_cv.fit(X, y)
print coef_path_lasso_cv.get_params
print "alphas:"
print coef_path_lasso_cv.alphas_
print "coef_:"
print coef_path_lasso_cv.coef_
lasso_prediction = coef_path_lasso_cv.predict(X)
lasso_score = coef_path_lasso_cv.score(X, y)
print "Lasso_score:%.3g" % lasso_score
#print "Lasso precision:%.3g" % precision_score(y, lasso_predict)
#print "Lasso_confusion matrix:"
#print confusion_matrix(y, lasso_prediction)
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=5)
print lasso_cv_score

plt.figure()
plt.hist2d(y, lasso_prediction)
plt.ylabel("Predicted Values")
plt.xlabel("Truth Values")
plt.title("Lasso Linear Regression")
plt.savefig("figures/lasso_predicted_truth.png")

print "#######ELASTIC#####"
coef_path_elastic_cv.fit(X, y)
test = pd.read_csv('test.csv')
ids = test['User_ID'].values
pid = test['Product_ID'].values
features_test = test.drop('User_ID', 1)

le = LabelEncoder()
print("assuming text variables are categorical & replacing them with numeric ids\n")
for c in featureNames:
    # features_train[c] = features_train[np.isnan(features_train[c])] = -1
    # features_test[c] = features_test[np.isnan(features_test[c])] = -1
    if features_train[c].dtype.name == 'object':
        le.fit(np.append(features_train[c], features_test[c]))
        features_train[c] = le.transform(features_train[c]).astype(int)
        features_test[c] = le.transform(features_test[c]).astype(int)

features_train = features_train.fillna(0)
features_test = features_test.fillna(0)

dhackLassoModel = LassoCV(cv=10).fit(features_train, labels_train)
pred = dhackLassoModel.predict(features_test)
submission = pd.DataFrame({"User_ID": ids, "Product_ID": pid, "Purchase": pred})

# print out the value of alpha that minimizes the CV error
print("alpha Value that Minimizes CV Error ", dhackLassoModel.alpha_)
print("Minimum MSE ", min(dhackLassoModel.mse_path_.mean(axis=-1)))

submission.to_csv("submissionLassocv.csv", index=False)
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class

    Parameters
    ----------
    tree_generator: object
        GradientBoostingRegressor or GradientBoostingClassifier,
        optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble
    feature_names: list of strings, optional (default=None)
        The names of the features (columns)
    """

    def __init__(self, tree_generator=None):
        self.tree_generator = tree_generator

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble"""
        ## Enumerate features if feature names not provided
        if feature_names is None:
            self.feature_names = ['feature_' + str(x) for x in range(0, X.shape[1])]
        else:
            self.feature_names = feature_names
        ## initialise tree generator
        if self.tree_generator is None:
            self.tree_generator = GradientBoostingRegressor()
        if type(self.tree_generator) not in [GradientBoostingRegressor, GradientBoostingClassifier,
                                             RandomForestRegressor, RandomForestClassifier]:
            raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")
        ## TODO: Error if tree generator not GB nor RF
        ## fit tree generator
        self.tree_generator.fit(X, y)
        tree_list = self.tree_generator.estimators_
        if isinstance(self.tree_generator, RandomForestRegressor) or \
           isinstance(self.tree_generator, RandomForestClassifier):
            tree_list = [[x] for x in self.tree_generator.estimators_]
        ## extract rules
        self.rule_ensemble = RuleEnsemble(tree_list=tree_list, feature_names=self.feature_names)
        ## concatenate original features and rules
        X_rules = self.rule_ensemble.transform(X)
        ## No rules found
        if X_rules.shape[0] == 0:
            X_concat = X
        else:
            X_concat = np.concatenate((X, X_rules), axis=1)
        ## initialise Lasso
        self.lscv = LassoCV()
        ## fit Lasso
        self.lscv.fit(X_concat, y)
        return self

    def predict(self, X):
        """Predict outcome for X"""
        X_rules = self.rule_ensemble.transform(X)
        X_concat = np.concatenate((X, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for
            maximum efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=True):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True (default), returns only the rules with an
            estimated coefficient not equal to zero.

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the
            rule, 'coef' holds the coefficients and 'support' the support of
            the rule in the training data set (X)
        """
        n_features = len(self.lscv.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            output_rules += [(self.feature_names[i], 'linear', self.lscv.coef_[i], 1)]
        ## Add rules
        for i in range(0, len(self.rule_ensemble.rules)):
            rule = rule_ensemble[i]
            output_rules += [(rule.__str__(), 'rule', self.lscv.coef_[i + n_features], rule.support)]
        rules = pd.DataFrame(output_rules, columns=["rule", "type", "coef", "support"])
        if exclude_zero_coef:
            rules = rules.ix[rules.coef != 0]
        return rules
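# A minimal usage sketch of the RuleFit class above (hypothetical data; assumes
# the scikit-learn estimators and the RuleEnsemble helper it references are
# importable):
import numpy as np
X_demo = np.random.rand(200, 5)
y_demo = X_demo[:, 0] * 3.0 + (X_demo[:, 1] > 0.5) * 2.0 + np.random.randn(200) * 0.1

rf_demo = RuleFit()  # defaults to GradientBoostingRegressor for rule generation
rf_demo.fit(X_demo, y_demo, feature_names=['a', 'b', 'c', 'd', 'e'])
print(rf_demo.predict(X_demo[:3]))
print(rf_demo.get_rules().head())  # linear terms first, then extracted rules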
print "2.5.b) Lambda déterminé par LassoCV:", clf.alpha_ # plot lasso path with CV choice ax1.axvline(clf.alpha_, color='K',linestyle='-', linewidth= 3) plt.annotate('CV', xy=(1.1*clf.alpha_,0.2), xycoords='data', xytext=(0, 0), textcoords='offset points', fontsize=18) plt.show(block=False) filename="lassoCV" image_name=dirname+filename+imageformat fig2.savefig(image_name) ################------ Exercice 2.5c ------############################### xnew = [6,0.3,0.2,6,0.053,25,149,0.9934,3.24,0.35,10] scorenew = clf.predict(xnew) print "2.5.c) Prédiction de score pour xnew = \ [6,0.3,0.2,6,0.053,25,149,0.9934,3.24,0.35,10]: ", scorenew ################------ Exercice 2.5d ------############################### mymodel = linear_model.LinearRegression(fit_intercept=False) mymodel.fit(X_cr,y_cr) print "coefs des MCOs" #print ([mymodel.coef_ , mymodel.intercept_]) print (mymodel.coef_) theta_LR = mymodel.coef_ # mymodel = linear_model.LinearRegression(fit_intercept=True)
# - [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html):
#   lasso regression with built-in cross-validation of the alpha parameter
# - **n_alphas:** number of alpha values (automatically chosen) to try

# select the best alpha with LassoCV
from sklearn.linear_model import LassoCV
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
lassoregcv.alpha_

# examine the coefficients
print lassoregcv.coef_

# predict method uses the best alpha value
y_pred = lassoregcv.predict(X_test)
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))

# ## Part 5: Regularized classification in scikit-learn
#
# - Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine)
# - **Goal:** Predict the origin of wine using chemical analysis

# ### Load and prepare the wine dataset

# read in the dataset
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(url, header=None)
wine.head()
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class

    Parameters
    ----------
    tree_generator: object
        GradientBoostingRegressor or GradientBoostingClassifier,
        optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble
    feature_names: list of strings, optional (default=None)
        The names of the features (columns)
    """

    def __init__(self, tree_generator=None):
        self.tree_generator = tree_generator

    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble"""
        self.feature_names = feature_names
        ## initialise tree generator
        if self.tree_generator is None:
            self.tree_generator = GradientBoostingRegressor()
        if type(self.tree_generator) not in [GradientBoostingRegressor, GradientBoostingClassifier,
                                             RandomForestRegressor, RandomForestClassifier]:
            raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")
        ## TODO: Error if tree generator not GB nor RF
        ## fit tree generator
        self.tree_generator.fit(X, y)
        tree_list = self.tree_generator.estimators_
        if isinstance(self.tree_generator, RandomForestRegressor) or \
           isinstance(self.tree_generator, RandomForestClassifier):
            tree_list = [[x] for x in self.tree_generator.estimators_]
        ## extract rules
        self.rule_ensemble = RuleEnsemble(tree_list=tree_list, feature_names=self.feature_names)
        ## concatenate original features and rules
        X_rules = self.rule_ensemble.transform(X)
        ## No rules found
        if X_rules.shape[0] == 0:
            X_concat = X
        else:
            X_concat = np.concatenate((X, X_rules), axis=1)
        ## initialise Lasso
        self.lscv = LassoCV()
        ## fit Lasso
        self.lscv.fit(X_concat, y)
        return self

    def predict(self, X):
        """Predict outcome for X"""
        X_rules = self.rule_ensemble.transform(X)
        X_concat = np.concatenate((X, X_rules), axis=1)
        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for
            maximum efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self):
        n_features = len(self.lscv.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        # iterate over all rules (the original range dropped the last rule)
        for i in range(0, len(self.rule_ensemble.rules)):
            rule = rule_ensemble[i]
            output_rules += [(rule.__str__(), self.lscv.coef_[i + n_features], rule.support)]
        return pd.DataFrame(output_rules, columns=["rule", "coef", "support"])
msg("Fitting!") weights = np.ones(train.shape[0]) do_statsmodels = True if do_statsmodels: ols = sm.wls(formula=formula, data=train, weights=weights).fit() print(ols.summary()) msg("Making predictions for all playergames") yy_df['ols_prediction'] = ols.predict(yy_df) else: ols_lr = LassoCV(n_jobs=-1, verbose=True) X = train[rhs_cols] y = train['elo'] ols_lr.fit(X, y) yy_df['ols_prediction'] = ols_lr.predict(X) yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs() yy_df['training'] = (yy_df['gamenum'] % 3) insample_scores = yy_df.groupby('training')['ols_error'].agg({ 'mean': np.mean, 'median': np.median, 'stdev': np.std }) print(insample_scores) msg("Error summary by ELO:") elo_centuries = cut(yy_df['elo'], 20) print( yy_df.groupby(elo_centuries)['ols_error'].agg({ 'sum': np.sum,
pd.options.display.max_rows = 1999

df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
df_list = [df_train, df_test]
df_final = pd.concat(df_list)

df_train_age = df_final[~np.logical_or(df_final['Age'].isnull(), df_final['Fare'].isnull())].copy()
df_train_age.loc[df_train_age['Fare'].isnull(), 'Fare'] = df_train_age['Fare'].dropna().median()
df_train_age['Sex'] = df_train_age['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_train_age.head(10)

predictors = ['Sex', 'SibSp', 'Parch', 'Fare']
print(df_train_age[predictors])
print('==================')
print(df_train_age[df_train_age['Age'].isnull()])

model = LassoCV(cv=10).fit(df_train_age[predictors], df_train_age['Age'])

df_test = pd.read_csv("test.csv")
df_test.loc[df_test['Fare'].isnull(), 'Fare'] = df_test['Fare'].dropna().median()
df_test['Sex'] = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)
df_test['AgeFill'] = model.predict(df_test[predictors])
print(df_test[['Name', 'Sex', 'Age', 'AgeFill']])
    sample[fold_size * 2:fold_size * 3],
    sample[fold_size * 3:fold_size * 4],
    sample[fold_size * 4:fold_size * 5],
    sample[fold_size * 5:fold_size * 6],
    sample[fold_size * 6:fold_size * 7],
    sample[fold_size * 7:fold_size * 8],
    sample[fold_size * 8:fold_size * 9],
    sample[fold_size * 9:]
]
for fold in tenFold_sample:
    test = fold
    train = [s for s in sample if s not in test]
    y_train = com_P2T_df.loc[train, :][column]
    x_train = com_comb_df.loc[train, :][lasso_otusPairs[column]]
    model = LinearRegression().fit(x_train, y_train)
    x_test = com_comb_df.loc[test, :][lasso_otusPairs[column]]
    P2T_pred_df.loc[test, column] = model.predict(x_test)

P2T_pred_df.fillna(0, inplace=True)

# pearson and spearman correlation
from scipy import stats
index_names = P2T_pred_df.columns.values
correlation_df = pd.DataFrame(index=index_names, columns=[
    'pearsonCor', 'pearsonPvalue', 'spearmanCor', 'spearmanPvalue',
    'coef_num', 'zeroCoef_num'
])
#print(correlation_df.head())
for column in P2T_pred_df:
def stacking_train(x_train, y_train, list_index1, list_index2, x):
    k_fold = len(list_index1)
    m = 5                    # number of features in the stacked training set
    k_fold_train_num = 486   # number of samples per fold
    n_train = len(x_train)   # number of samples in the stacked training set
    data_stacking_train = pd.DataFrame(np.zeros((n_train, m)))
    data_stacking_submit = pd.DataFrame(np.zeros((len(x), m)))
    models = [LassoCV(),
              RidgeCV(),
              GradientBoostingRegressor(n_estimators=150, max_depth=3),
              XGBRegressor(n_estimators=470, max_depth=2),
              BaggingRegressor(base_estimator=LassoCV(), n_estimators=50,
                               max_samples=0.6, max_features=0.8),
              ]
    # For each fold, train the first-level models and use their predictions
    # to generate the features of the stacked training set.
    for i in list(range(k_fold)):
        pred = []
        pred_test = []
        pred_submit = []
        index1 = list_index1[i]
        index2 = list_index2[i]
        for model in models:
            for n in index1:
                if n not in list(y_train.index):
                    print('no', n)
            model.fit(x_train.iloc[index1, :], y_train.iloc[index1, :])
            pred.append(model.predict(x_train.iloc[index2, :]))
            pred_submit.append(model.predict(x))
        # Assign the first-level predictions as feature values of the newly
        # constructed stacking training set.
        for j in range(k_fold_train_num):
            b = index2[0]
            data_stacking_train.iloc[j + b, 0] = pred[0][j]
            data_stacking_train.iloc[j + b, 1] = pred[1][j]
            data_stacking_train.iloc[j + b, 2] = pred[2][j]
            data_stacking_train.iloc[j + b, 3] = pred[3][j]
            data_stacking_train.iloc[j + b, 4] = pred[4][j]
        # Use the first-level models to generate features for the submission set.
        for k in range(len(x)):
            data_stacking_submit.iloc[k, 0] += pred_submit[0][k] / k_fold
            data_stacking_submit.iloc[k, 1] += pred_submit[1][k] / k_fold
            data_stacking_submit.iloc[k, 2] += pred_submit[2][k] / k_fold
            data_stacking_submit.iloc[k, 3] += pred_submit[3][k] / k_fold
            data_stacking_submit.iloc[k, 4] += pred_submit[4][k] / k_fold
        print(i)
    ######### Train the second-level model on data_stacking_train
    model_2 = LassoCV()
    model_2.fit(data_stacking_train, y_train)
    return (model_2.predict(data_stacking_submit))
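# A sketch of how the fold index lists consumed by stacking_train might be
# built (hypothetical; any k-fold split over the training rows works):
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=0)
list_index1, list_index2 = [], []
for idx_train, idx_holdout in kf.split(x_train):
    list_index1.append(list(idx_train))    # rows used to fit the level-1 models
    list_index2.append(list(idx_holdout))  # rows that receive the stacked features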
# prepare features
X_train = alldata.iloc[:train.shape[0], :]
X_test = alldata.iloc[train.shape[0]:, :]

# prepare target
train['SalePrice_log'] = np.log1p(train['SalePrice'])
y = train.SalePrice_log

# model
alphas = np.linspace(0.0001, 0.001, 100)
cv = 5
model_lasso = LassoCV(alphas=alphas, cv=cv)
res = model_lasso.fit(X_train, y)
score = cross_val_score(model_lasso, X_train, y, cv=cv).mean()
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
print 'Lasso has chosen alpha to be %f.' % (res.alpha_)
print 'The cross validation score is %f.' % (score)

# plot the most significant features
plot_import_vars(coef, 5)

# prediction
preds = np.expm1(model_lasso.predict(X_test))
solution = pd.DataFrame({'id': test.Id, 'SalePrice': preds})
solution.to_csv('house_price.csv', index=False)

# to do:
# fill missing value by distribution
# create new features
# more samples
train_num = len(train)
all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                      test.loc[:, 'MSSubClass':'SaleCondition']))

# log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

# log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)

# filling NA's with the mean of the column:
all_data = all_data.fillna(all_data.mean())

X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y)

print 'generating file'
y_test_pred = model_lasso.predict(X_test)
submission = pd.DataFrame({"Id": test["Id"], "SalePrice": y_test_pred})
submission.loc[submission['SalePrice'] <= 0, 'SalePrice'] = 0
fileName = "submission_.csv"
submission.to_csv(fileName, index=False)
def Lasso_Mode(X_train, y_train, X_test, y_test, num_class):
    algo_name = 'Lasso Regression'
    lasso_model = LassoCV(alphas=[0.01, 0.05, 0.10, 0.20, 0.50, 1])
    lasso_model.fit(X_train, y_train)
    y_pred_lm = lasso_model.predict(X_test)
    PRAF(y_test, y_pred_lm, num_class, algo_name)
df.index = range(1994, 2014)
df.loc[2014] = None
df.loc[2015] = None
l = ['x1', 'x2', 'x3', 'x4', 'x5', 'x7']
for i in l:
    f = GM11(df[i][list(range(1994, 2014))].values)[0]
    df[i][2014] = f(len(df) - 1)
    df[i][2015] = f(len(df))
    df[i] = df[i].round(2)

features = ['x1', 'x2', 'x3', 'x4', 'x5', 'x7']
train = df.loc[list(range(1994, 2014)), features + ['y']].copy()
scaler = StandardScaler()
train = scaler.fit_transform(train)
x_train = train[:, :-1]
y_train = train[:, -1]

model = Sequential()
model.add(Dense(12, input_shape=(6, )))
model.add(Activation('relu'))
model.add(Dense(1, input_shape=(12, )))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=10000, batch_size=16)
model.save_weights('1-net.model')

x = (df[features] - scaler.mean_[:-1]) / scaler.scale_[:-1]
df['y_pred'] = model.predict(x) * scaler.scale_[-1] + scaler.mean_[-1]
df[['y', 'y_pred']].plot(subplots=True, style=['b-o', 'r-*'])
plt.show()
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
print y

def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y, scoring="neg_mean_squared_error", cv=5))
    return (rmse)

model_ridge = Ridge()
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas]
cv_ridge = pd.Series(cv_ridge, index=alphas)
cv_ridge.plot(title="Validation - Just Do It")
plt.xlabel("alpha")
plt.ylabel("rmse")

model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y)
rmse_cv(model_lasso).mean()
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y})
preds["residuals"] = preds["true"] - preds["preds"]
preds = np.expm1(model_lasso.predict(X_test))
solution = pd.DataFrame({"id": test.Id, "SalePrice": preds})
solution.to_csv("ridge_sol.csv", index=False)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

# try a smaller alpha
las = Lasso(alpha=0.0001, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
lascv = LassoCV(normalize=True, alphas=alpha_range)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
print 'RMSE (Lasso CV reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

###############################################################################
##### Regularization with Logistic Regression
###############################################################################

## TASK: Regularized classification
## FUNCTION: LogisticRegression
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Titanic (n=891, p=5 selected, type=classification)
## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data

########## Prepare data ##########
# Get and prepare data
watchlist = [(dtrain, 'train')]  # list of things to evaluate and print
gbm = xgb.train(params, dtrain, num_boost_round, evals=watchlist,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=True)  # find the best score
x_pred = np.expm1(gbm.predict(dtest))

# In[188]:

elastic = ElasticNet(alpha=0.0005, l1_ratio=0.9)
elastic.fit(X_train, y)
elas_preds = np.expm1(elastic.predict(X_test))

# In[189]:

lasso_preds = np.expm1(lasso_model.predict(X_test))
final_result = 0.8 * lasso_preds + 0.2 * x_pred  # + 0.1*elas_preds
solution = pd.DataFrame({
    "id": test.Id,
    "SalePrice": final_result
}, columns=['id', 'SalePrice'])
solution.to_csv("final_upload_11PM2.csv", index=False)

# In[ ]:
# # Plot and inspect the distribution of predictions vs. actual values:
# plt.plot(pre[0:100], c='red', label="pre")
# plt.plot(data_test_6_label[0:100], c='black', label='true')
# plt.title("lasso pre and label distribute circumstance")
# plt.legend()
# plt.show()
# # 0.77679992833927
# # 9.687457422736461
## prediction quality is far too poor, almost no variation

# use lasso instead
from sklearn.linear_model import LassoCV
model_lasso = LassoCV(alphas=[x for x in np.arange(0, 2, 0.000001)]).fit(
    train, train_label)
y_redge = np.expm1(model_lasso.predict(test))
print(mean_absolute_error(data_test_6_label, y_redge))

# plot
plt.plot(y_redge[0:100], c='red', label="pre")
plt.plot(data_test_6_label[0:100], c='black', label='true')
plt.title("lasso pre and label distribute circumstance")
plt.legend()
plt.show()
'''
The skewed version works well! The lasso performs better,
so we use it to predict the test set.
Another advantage of the lasso is that it does feature selection for you --
it sets the coefficients of features it considers unimportant to zero.
Let's look at the coefficients: numeric features
'''
print(lasso.coef_)
print(lasso.score(X_test, y_test))
y_pred = lasso.predict(X_test)
mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred)

num_alphas = 200
alphas = np.linspace(0.01, 10, num_alphas)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=SEED)
lasso_cv.fit(X_train, y_train)
print(lasso_cv.intercept_)
print(lasso_cv.coef_)
print(lasso_cv.score(X_test, y_test))
y_pred = lasso_cv.predict(X_test)
mse_lasso_cv = mean_squared_error(y_true=y_test, y_pred=y_pred)

n_alphas = 200
ridge_alphas = np.logspace(-2, 6, n_alphas)
ridge_cv = RidgeCV(alphas=ridge_alphas, scoring="neg_mean_squared_error", cv=3)
ridge_cv.fit(X_train, y_train)
print(ridge_cv.intercept_)
print(ridge_cv.coef_)
print(ridge_cv.score(X_test, y_test))
y_pred = ridge_cv.predict(X_test)
mse_ridge_cv = mean_squared_error(y_true=y_test, y_pred=y_pred)
    # 'hot',
    # 'frigid',
    # 'all_high_snow',
    # 'all_high_precip',
    'cold'
]
X_total = store_df[columns_list]
X_train = df_train[columns_list]
X_test = df_test[columns_list]
total_data = X_total.values
train_data = X_train.values
test_data = X_test.values

regr = regr.fit(train_data[0::, 1::], train_data[0::, 0])
#print(regr.alpha_, store, item)
prediction = regr.predict(test_data[0::, 1::])
prediction = np.maximum(prediction, 0.)
prediction_total = regr.predict(total_data[0::, 1::])
prediction_total = np.maximum(prediction_total, 0.)
total_series = pd.Series(prediction_total, unique_dates_int)

rmse = np.sqrt(((test_data[0::, 0] - prediction) ** 2).mean())
se = ((test_data[0::, 0] - prediction) ** 2).sum()
# print(rmse, store, item)
rmse_total = rmse_total + rmse
se_total = se_total + se

# plt.scatter(df_test.index, test_data[0::, 0] - prediction)
# plt.xlabel('date')
# plt.xlim(0, 1050)
# plt.ylabel('truth - pred')
#outliers_id = np.array([31, 463, 524, 633, 969, 971, 1299, 1325])
outliers_id = np.array([523, 1298])
X_train = X_train.drop(outliers_id)
y = y.drop(outliers_id)

def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, y,
                                    scoring="neg_mean_squared_error", cv=5))
    return (rmse)

# LASSO MODEL
clf1 = LassoCV(alphas=[1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))

# ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))

# XGBOOST
clf3 = xgb.XGBRegressor(colsample_bytree=0.4,
                        gamma=0.045,
                        learning_rate=0.07,
                        max_depth=20,
                        min_child_weight=1.5,
                        n_estimators=300,
                        reg_alpha=0.65,
                        reg_lambda=0.45,
print('Condition:')  # the original passed uline=True, which print() does not accept
for i in range(10):
    plt.plot(tcmax_condition.alphas_, tcmax_condition.mse_path_[:, i])
plt.xlim([max(tcmax_condition.alphas_), min(tcmax_condition.alphas_)])
plt.axvline(tcmax_condition.alpha_, c='r')
plt.xlabel('Alpha')
plt.ylabel('CVE')
plt.title('TcMax Condition')
plt.show()

## Check Heat vs Control
p = np.array(
    list(
        zip([
            'Heat' if i > np.mean(tcmax_condition.predict(test.T)) else 'Control'
            for i in tcmax_condition.predict(test.T)
        ], [
            samples['Condition'][i] for i in samples.index if i in test.columns
        ])))
print(p)
p = sum([i[0] == i[1] for i in p])

## Check empirical p-value for Heat vs Control
labels = [
    1 if samples['Condition'][i] == 'Heat' else 0
    for i in samples.index if i in base.columns
]
dist = []
for i in range(50):
    np.random.shuffle(labels)  # in place
rfRegressorPredicts = np.expm1(rfRegressor.predict(dataTest))
print('The rfRegressor achieves RMSE of ', rmseCV(rfRegressor).mean())

# Train the Lasso Regressor
from sklearn.linear_model import Lasso, LassoCV  # Ridge, RidgeCV, ElasticNet, LassoLarsCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.00075, 0.0005, 0.0004], cv=5).fit(dataTrain, targetTrain)
print('The amount of penalization in LASSO chosen by cross validation is', lasso.alpha_)

# Make it more robust to outliers using sklearn's RobustScaler() in a pipeline
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
lasso.fit(dataTrain, targetTrain)
print('The lasso model achieves RMSE of ', rmseCV(lasso).mean())
lassoPredicts = np.expm1(lasso.predict(dataTest))

# Train the GradientBoostingRegressor (using huber loss for robustness to outliers)
gboost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)
gboost.fit(dataTrain, targetTrain)
gboostPredicts = np.expm1(gboost.predict(dataTest))
# print('The GradientBoostingRegressor achieves RMSE of ', rmseCV(gboost))

predictions = lassoPredicts  # 1.0/3.0*(rfRegressorPredicts + lassoPredicts + gboostPredicts)
print("Try again for more precision with alphas centered around " + str(alpha)) mod_lasso = LassoCV(alphas=[ alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4 ], max_iter=50000, cv=10) mod_lasso.fit(X_train, y_train) alpha = mod_lasso.alpha_ #print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean()) print("Best alpha :", alpha) print("Lasso RMSE on Training set :", rmse_cv_train(mod_lasso).mean()) y_train_las = mod_lasso.predict(X_train) #y_test_las = lasso.predict(X_test) y_pred1 = mod_lasso.predict(X_train) y_pred2 = mod_lasso.predict(X_test) score1 = np.mean( np.abs((np.expm1(y_train) - np.expm1(y_pred1)) / np.expm1(y_train))) * 100 score2 = np.mean( np.abs((np.expm1(y_test) - np.expm1(y_pred2)) / np.expm1(y_test))) * 100 print("\nLASSO Model Report") print("train {:.2f} | valid {:.2f}".format(float(score1), float(score2))) # Plot residuals plt.scatter(y_train_las, y_train_las - y_train, c="blue",
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y)
print "lasso rmse: "
print rmse_cv(model_lasso).mean()

coef = pd.Series(model_lasso.coef_, index=X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) +
      " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)])
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()

# display the model's predictions
preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x="preds", y="residuals", kind="scatter")
plt.show()

## show the distribution of the estimation errors
diff = preds["true"] - preds["preds"]
diff.hist()
plt.show()
ana_ind_val = ana_clf.predict(X_val_2)
X_train_2 = X_tr_2[msk]
train_ind = ana_clf.predict(X_train_2)

## fit
est1 = LassoCV(normalize=True, random_state=0).fit(X_train_1_scaled[train_ind == 1],
                                                   np.log1p(y_train[train_ind == 1]))
est2 = GradientBoostingRegressor(random_state=0, n_estimators=500,
                                 subsample=0.9).fit(X_train_1_scaled, np.log1p(y_train))

## predict
preds1 = np.expm1(est1.predict(X_val_1_scaled))
preds2 = np.expm1(est2.predict(X_val_1_scaled))

fin_preds = []
ana_val_scores = ana_clf.decision_function(X_val_2)
ana_val_ind = np.argsort(ana_val_scores)
sorted_ana_val_scores = ana_val_scores[ana_val_ind]
# get worst outliers
ana_val_ind_worst = ana_val_ind[:3]
print(sorted_ana_val_scores[:3])
for idx in range(len(y_val)):
    if idx in ana_val_ind_worst:
        fin_preds.append(preds2[idx])
    else:
" variables") imp_coef = pd.concat( [coef.sort_values().head(10), coef.sort_values().tail(10)]) import matplotlib matplotlib.rcParams['figure.figsize'] = (8.0, 10.0) imp_coef.plot(kind="barh") plt.title("Coefficients in the Lasso Model") plt.show() #let's look at the residuals as well: matplotlib.rcParams['figure.figsize'] = (6.0, 6.0) preds = pd.DataFrame({ "preds": model_lasso.predict(train), "true": train_label }) preds["residuals"] = preds["true"] - preds["preds"] print(preds.head()) preds.plot(x="preds", y="true", kind="scatter", title="preds-true-distribute-lasso2,xgboost" ) # 直接用pandas得dataframe对象画图的时候给出下标就可以画了; plt.show() import xgboost as xgb dtrain = xgb.DMatrix(train, label=train_label)
# split the data into train (0.75) and test (0.25)
from sklearn.cross_validation import train_test_split
train, test = train_test_split(movie_df, test_size=0.25)

# fit lasso CV model
# predictor_list is the list of variables that we want to use for modelling.
# Here I get it from the column names minus the targets and the movie title;
# you can create your own directly from the name list.
predictor_list = list(train.columns.values)
predictor_list.remove('domestic_gross')
predictor_list.remove('tomatoRating')
predictor_list.remove('new_title')

# fit the lasso model, which does selection and estimation in one step.
# If you want to see which variables are significant, there is an easy way:
# just look at the coefficients; a nonzero coefficient means the variable is
# used for prediction.
from sklearn.linear_model import LassoCV
clf = LassoCV(cv=20).fit(train[predictor_list], train.domestic_gross)
Y_pred = clf.predict(test[predictor_list])
print(clf.coef_)

# calculate the mean_squared_error
from sklearn.metrics import mean_squared_error
print mean_squared_error(Y_pred, test.domestic_gross)
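# A small follow-up sketch: map the fitted coefficients back to the predictor
# names to list which variables the lasso actually kept (nonzero):
selected = [(name, c) for name, c in zip(predictor_list, clf.coef_) if c != 0]
for name, c in selected:
    print name, c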
lasso.fit(X, y)
ridge.fit(X, y)
lr.coef_.astype(str)
lasso.coef_
ridge.coef_

x_pred = np.linspace(0, 1, 100)
X_pred = to_polynomial(x_pred)

#plt.plot(x, y_real, '--', alpha=0.5, label='Real function')
plt.scatter(x, y, s=100, c='g', label='Data points')
plt.plot(x_pred, lr.predict(X_pred), c='red', label='Prediction - No regularization')
plt.plot(x_pred, lasso.predict(X_pred), c='blue', alpha=0.5, label='Prediction - L1 regularization')
plt.plot(x_pred, ridge.predict(X_pred), c='purple', alpha=0.5, label='Prediction - L2 regularization')
_ = plt.legend(loc='best')

"""# Impact of different levels of regularization."""

from sklearn.linear_model import Ridge
import seaborn as sns

x_pred = np.linspace(0, 1, 100)
X_pred = to_polynomial(x_pred)
plt.scatter(x, y, s=100, c='g', label='Data points')
for i, alpha in enumerate([1.0, 0.01, 0.0001, 0.]):
    ridge = Ridge(alpha=alpha)
    ridge.fit(X, y)
class MovieTrainer(object): def __init__(self,training_file,test_file): self._training_pickle=training_file self._test_pickle=test_file #to be defined later self._list_of_dicts=None self._dataframe=None self._features=None self._test_features=None self._labels=None self._clf=None self._training_frame=None self._test_frame=None self._prediction_frame=None #dicts self._actor_dict=None self._director_dict=None self._genre_dict=None self._production_house=None def _load_dataframe(self): if os.path.isfile(self._training_pickle) ==True: self._training_dict=pickle.load(file(self._training_pickle)) else: raise AttributeError("Cannot find pickle file:%s"%self._training_pickle) if os.path.isfile(self._test_pickle) ==True: self._test_dict=pickle.load(file(self._test_pickle)) else: raise AttributeError("Cannot find pickle file:%s"%self._test_pickle) #load pandas frame self._training_frame=pd.DataFrame(self._training_dict) self._test_frame=pd.DataFrame(self._test_dict) #drop movies with no names self._training_frame.dropna(subset=["moviename"]) self._test_frame.dropna(subset=["moviename"]) return #raise error? def _addtodict(self,name,this_dict): if this_dict.has_key(name): this_dict[name]+=1 else: this_dict[name]=1 return def _modify_string(self,playername): playername = re.sub('^\s+|\s+$','', playername) playername=re.sub('\s+','_',playername) playername=re.sub('\*','',playername) return playername #this function creates a list of features #corresponding to the most frequent actors #in a movie def _create_playerdict(self,frame,colname,num_features): playerdict={} for index in frame.index: #for each row, we have list of actors #like ['Sandra Bullock', 'Melissa McCarthy'] playerlist=frame.ix[index,colname] if type(playerlist)!=float: #only actors have multiple list members, other players #like director don't if colname=="actors": for playername in playerlist: #remove spaces, *, leading trailing spaces playername=self._modify_string(playername) self._addtodict(playername,playerdict) else: playerlist=self._modify_string(playerlist) self._addtodict(playerlist,playerdict) counter=0 feature_list=[] #sort the dict to get players with highest number of movies for key,value in sorted(playerdict.items(),key=lambda x:x[1],reverse=True): #print key,value feature_list.append(key) counter+=1 if counter>num_features: break return feature_list #this function returns a value of the player features for #each movie def _create_player_features(self,frame,colname,num_features): #feature_list is all names of players with most movies feature_list=self._create_playerdict(frame,colname,num_features) actor_frame = pd.DataFrame() for player in feature_list: feature_name=colname+":"+player actor_frame[feature_name]=pd.Series(0,index=frame.index) bigplayer_name="feature:big_"+colname #TODO: take out of loop actor_frame[bigplayer_name]=pd.Series(0,index=frame.index)#big actors directors present or not? 
for index in frame.index: playerval=frame.ix[index,colname] if type(playerval)!=float: #playerval is not None if colname=="actors": for actor in playerval: actor=self._modify_string(actor) if actor in feature_list: thisfeature=colname+":"+actor actor_frame.loc[index,thisfeature]=1 else: playerval=self._modify_string(playerval) if playerval in feature_list: thisfeature=colname+":"+playerval actor_frame.loc[index,thisfeature]=1 actor_frame.loc[index,bigplayer_name]=1 else: actor_frame.loc[index,bigplayer_name]=0 return actor_frame def _create_theater_features(self,frame): #add feature column theater_frame=pd.DataFrame() theater_frame["feature:num_theaters"]=pd.Series(0,index=frame.index) for index in frame.index: theater_list=frame.ix[index,"theater_list"] if type(theater_list)==list and len(theater_list)>0: theater=theater_list[0] theater=re.sub(',','',theater) if re.search('\d+',theater) is not None: theater_frame.loc[index,"feature:num_theaters"]=int(theater) else: theater_frame.loc[index,"feature:num_theaters"]=0 else: theater_frame.loc[index,"feature:num_theaters"]=0 return theater_frame def _first_weekend_rank(self,frame): #todo: try to merge with create theater features weekend_frame = pd.DataFrame() weekend_frame["feature:rank"]=pd.Series(0,index=frame.index) for index in frame.index: rank_list=frame.ix[index,"rank_list"] if type(rank_list)==list and len(rank_list)>0: rank=rank_list[0] rank=re.sub(',','',rank) if re.search('\d+',rank) is not None: weekend_frame.loc[index,"feature:rank"]=int(rank) else: weekend_frame.loc[index,"feature:rank"]=1000#some large number? or zero? else: weekend_frame.loc[index,"feature:rank"]=1000 return weekend_frame def _create_running_time_feature(self,frame): runtime_frame = pd.DataFrame() runtime_frame["feature:runtime"]=pd.Series(0,index=frame.index) for index in frame.index: running_time=frame.ix[index,"runtime"] if type(running_time)!= float: #not NaN pattern='(\d+).+\s(\d+)' hrmin=re.match(pattern,running_time) if hrmin is not None: hrs=hrmin.group(1) mins=hrmin.group(2) tot_time=int(hrs)*60+int(mins) runtime_frame.loc[index,"feature:runtime"]=tot_time else: runtime_frame.loc[index,"feature:runtime"]=0 else: runtime_frame.loc[index,"feature:runtime"]=0 return runtime_frame def _create_release_date_feature(self,frame): monthlist=["January","February","March","April","May","June"\ "July","August","September","October","November","December"] month_frame = pd.DataFrame() for month in monthlist: feature_name="feature:release_"+month month_frame[feature_name]=pd.Series(0,index=frame.index) for index in frame.index: release_date=frame.ix[index,"release_date"] if type(release_date)!=float: pattern='(\S+)\s(\d+)' monthday=re.match(pattern,release_date) if monthday is not None: month=monthday.group(1) day=monthday.group(2) if month in monthlist: thisfeature="feature:release_"+month month_frame.loc[index,thisfeature]=1 return month_frame def _extract_features(self,frame,isTraining=True): """ extracts features from training and test frame all major data munging, cleaning takes place here """ #pass #we will make clean_frame as the data frame, #then we will define the training/test frame #and add each feature as a dataframe #and finally concatenate the features #check if labels exist for these movies clean_data=frame[pd.notnull(frame["domestic_gross"])] list_of_frames=[] #no of theaters it opened at in the first week #keep this as first feature so that you can plot using this list_of_frames.append( self._create_theater_features(clean_data) ) print "Created 
        list_of_frames.append(self._first_weekend_rank(clean_data))
        print "Created Rank Feature..."
        list_of_frames.append(self._create_running_time_feature(clean_data))
        print "Created running time Feature..."
        list_of_frames.append(self._create_release_date_feature(clean_data))
        print "Created release date Feature..."
        # create player features
        list_of_frames.append(self._create_player_features(clean_data, "actors", 5))
        list_of_frames.append(self._create_player_features(clean_data, "director", 5))
        list_of_frames.append(self._create_player_features(clean_data, "distributor", 5))
        list_of_frames.append(self._create_player_features(clean_data, "genre_toplist", 5))
        list_of_frames.append(self._create_player_features(clean_data, "mpaa_rating", 5))
        print "Created player Features..."
        # check dataframe shapes
        for frames in list_of_frames:
            assert frames.shape[0] == clean_data.shape[0]
        # concatenate the dataframes
        final_frame = pd.concat(list_of_frames, axis=1)
        final_frame.to_csv("Training/training_frame.csv")
        # get training labels
        if isTraining:
            labels_arr = self._extract_labels(clean_data)
        else:
            prediction_frame = clean_data[["moviename", "genre_toplist", "actors"]]
        n_samples = len(final_frame.index)
        n_features = len(final_frame.columns)
        # from DataFrame to numpy array
        feature_arr = final_frame.values.reshape(n_samples, n_features)
        print "Created All Features....."
        if isTraining:
            return feature_arr, labels_arr
        else:
            return feature_arr, prediction_frame

    def _extract_labels(self, frame):
        df_Y = frame["domestic_gross"].values
        gross_list = df_Y.tolist()
        for i in range(len(gross_list)):
            gross_list[i] = int(gross_list[i])
        max_gross = np.max(gross_list)
        # scale gross to [0, 1]; float() avoids Python 2 integer division,
        # which would zero out every label except the maximum
        gross_list = [float(x) / max_gross for x in gross_list]
        n_samples = len(gross_list)
        gross_arr = np.array(gross_list).reshape(n_samples, 1)
        return gross_arr

    def _get_top_actors(self, actorlist):
        top_actors = [None, None, None]
        if type(actorlist) == float:  # NaN
            return top_actors
        counter = 0
        for actor in actorlist:
            top_actors[counter] = self._modify_string(actor)
            counter += 1
            if counter == 3:  # fill all three slots
                break
        return top_actors

    def explore_data(self):
        """
        Plots and prints various kinds of stuff to test out the data.
        Change, comment and uncomment here directly.
        """
        if self._training_frame is None:
            self._load_dataframe()
        #col_list.remove('actors')
        #print col_list
        #self._training_frame.drop(col_list,axis=1,inplace=True)
        #print self._training_frame.ix[500:510]
        #print len(self._training_frame.index)
        #only_budget=self._training_frame[pd.isnull(self._training_frame["domestic_gross"])]
        #print len(only_budget.index)
        #actors_there=self._training_frame[pd.notnull(self._training_frame["actors"])]
        #print len(actors_there.index)
        #print actors_there.head()
        #director_there=self._training_frame[pd.notnull(self._training_frame["director"])]
        #print len(director_there.index)
        #print director_there.head()
        pass

    def top_5_genres(self):
        if self._training_frame is None:
            self._load_dataframe()
        genre_list = self._create_playerdict(self._training_frame, "genre_toplist", 5)
        print genre_list

    def train_2013(self):
        self._load_dataframe()
        self._training_frame.to_csv("Training/raw_frame.csv")
        total_features, total_labels = self._extract_features(self._training_frame, isTraining=True)
        total_labels = np.ravel(total_labels)
        print type(total_features)
        print type(total_labels)
        # create train and test split
        self._features, test_features, self._labels, test_labels = \
            train_test_split(total_features, total_labels, test_size=0.33)
        print self._features.shape
        print self._labels.shape
        print test_features.shape
        print test_labels.shape
        cv_outer = KFold(self._labels.shape[0], n_folds=5)
        self._clf = LassoCV(eps=0.01, n_alphas=10, cv=5)
        cross_val_arr = cross_val_score(self._clf, self._features, self._labels, cv=cv_outer)
        print "Finished Training....."
        r_sq = np.mean(cross_val_arr)
        print "R Square for training set: ", r_sq
        self._clf.fit(self._features, self._labels)
        plt.plot(test_labels, self._clf.predict(test_features), 'ro', linewidth=2)
        plt.plot(np.arange(0, 1., .1), np.arange(0, 1., .1), 'b-', linewidth=2)
        plt.xlabel("Actual Gross")
        plt.ylabel("Predicted Gross")
        plt.show()

    def test_2014(self):
        # check if already trained
        if self._clf is None:
            self.train_2013()
        print "Generating Test Features..."
        self._test_features, self._prediction_frame = self._extract_features(
            self._test_frame, isTraining=False)
        self._prediction_frame["prediction"] = self._clf.predict(self._test_features)
        print "Finished Testing..."
        # sanity check and normalize
        self._prediction_frame["prediction"] = self._prediction_frame["prediction"].apply(
            lambda x: 0 if x < 0 else x)
        maxpred = self._prediction_frame["prediction"].max()
        if maxpred > 1:
            self._prediction_frame["prediction"] = self._prediction_frame["prediction"].apply(
                lambda x: x / maxpred)
        print self._prediction_frame.head()

    def save_db(self, filename):
        con = sqlite3.connect(filename)
        cursor = con.cursor()
        cursor.execute('DROP TABLE IF EXISTS currentmovies')
        cursor.execute('CREATE TABLE currentmovies(\
            moviename VARCHAR(255),\
            genre VARCHAR(255),\
            prediction INT,\
            actor1 VARCHAR(255),\
            actor2 VARCHAR(255),\
            actor3 VARCHAR(255))')
        for index in self._prediction_frame.index:
            movname = self._prediction_frame.ix[index]["moviename"]
            # skip rows with a missing movie name *before* encoding,
            # since float NaN has no encode()
            if type(movname) == float and math.isnan(movname):
                continue
            movname = movname.encode('utf-8')
            pred = self._prediction_frame.ix[index]["prediction"]
            genre = self._prediction_frame.ix[index]["genre_toplist"].encode('utf-8')
            (actor1, actor2, actor3) = self._get_top_actors(self._prediction_frame.ix[index]["actors"])
            print movname, genre, pred
            cursor.execute('INSERT INTO currentmovies\
                VALUES(?,?,?,?,?,?)', (movname, genre, pred, actor1, actor2, actor3))
        con.commit()
        con.close()
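# A minimal usage sketch for MovieTrainer (the pickle paths and DB filename are
# hypothetical, not from the original script): train on the 2013 frame, predict
# on the 2014 frame, and persist the predictions to SQLite.
if __name__ == "__main__":
    trainer = MovieTrainer("Training/movies_2013.pkl", "Training/movies_2014.pkl")
    trainer.train_2013()   # fits LassoCV and plots predicted vs. actual gross
    trainer.test_2014()    # builds test features and fills the prediction frame
    trainer.save_db("movies.db")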
# try a smaller alpha
las = Lasso(alpha=0.0001, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, preds))

# use LassoCV to select the best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
lascv = LassoCV(normalize=True)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, preds))

## TASK: Regularized classification
## FUNCTION: LogisticRegression
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Titanic (n=891, p=5 selected, type=classification)
## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data

# define X and y
feature_cols = ["pclass", "sex", "age", "embarked_Q", "embarked_S"]
X = titanic[feature_cols]
y = titanic.survived

# split into train/test
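# A hedged continuation sketch for the Titanic task above (the split and the C
# value are assumptions, not from the original notes): LogisticRegression applies
# an L2 penalty by default, and C is the *inverse* regularization strength, so a
# smaller C means a stronger penalty.
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
logreg = LogisticRegression(C=1.0)  # try smaller C for more regularization
logreg.fit(X_train, y_train)
print(metrics.accuracy_score(y_test, logreg.predict(X_test)))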
        # (excerpt begins inside the per-clf, per-fold loops; the guard below is a
        # hypothetical reconstruction of the elided condition)
        if hasattr(clf, 'fit_cv'):
            clf.fit_cv(X_train, Y_train, [(X_cv, Y_cv)])
        else:
            clf.fit(X_train, Y_train)
        one_result = clf.predict(X_cv)
        blend_train[cv_index, j] = one_result
        cv_score = gini_normalized(Y_cv, blend_train[cv_index, j])
        cv_results[j, i] = cv_score
        # mean_absolute_error gives MAE, so label it as such
        score_mae = metrics.mean_absolute_error(Y_cv, one_result)
        print('Fold [%s] norm. Gini = %0.5f, MAE = %0.5f' % (i, cv_score, score_mae))
        blend_test_j[:, i] = clf.predict(X_test)
    blend_test[:, j] = blend_test_j.mean(1)
    print('Clf_%d Mean norm. Gini = %0.5f (%0.5f)' % (j, cv_results[j, :].mean(), cv_results[j, :].std()))

end_time = datetime.now()
time_taken = (end_time - start_time)
print("Time taken for pre-blending calculations: {0}".format(time_taken))
print("CV-Results", cv_results)
print("Blending models.")
bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5,
               fit_intercept=True, max_iter=10000, positive=True)
bclf.fit(blend_train, Y_dev)
Y_test_predict = bclf.predict(blend_test)
cv_score = cv_results.mean()
print('Avg. CV-Score = %s' % (cv_score))
submission = pd.DataFrame({"Id": test_ids, "Hazard": Y_test_predict})
submission = submission.set_index('Id')
submission.to_csv("farons_solution.csv")
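# gini_normalized is called above but not defined in this excerpt; a sketch of
# the usual Kaggle formulation (an assumption, not the script's own definition):
# sort actuals by descending prediction and compare the cumulative captured share
# against the random-ordering baseline, then normalize by the perfect-order gini.
def gini(actual, pred):
    a = np.asarray(actual, dtype=float)
    order = np.argsort(np.asarray(pred))[::-1]  # highest predictions first
    a = a[order]
    cum = np.cumsum(a) / a.sum()
    return (cum.sum() / len(a)) - (len(a) + 1.) / (2. * len(a))

def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)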
my_l = my_l[:-1]
my_list_arr = np.array(my_l)
X_train = df.values[:1495, :126]
Y_train = df.values[:1495, 126:]
X_test = df.values[1495:, :126]
Y_test = df.values[1495:, 126:]
arr = np.linspace(0.001, 100.1, 3000)
print(arr)
model = LassoCV(alphas=arr, cv=5)
model.fit(X_train, Y_train.ravel())  # LassoCV expects a 1-d target
preds = model.predict(X_test)
error = mean_squared_error(Y_test, preds)
print("Score:", model.score(X_test, Y_test))
print("Test Error:", error)
print("penalty", model.alpha_)
final_features = model.coef_
# print("Coef_path:", model.coef_)
# print(final_features.shape)
print(type(final_features))
for i in range(0, 126):
    if final_features[i] != 0:
        print(my_list_arr[i], ":", final_features[i])
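# The loop above prints the surviving features one by one; a boolean mask over
# the same names (a sketch, same variables as above) does it in one step and
# also reports how aggressively the L1 penalty pruned the feature set:
nonzero = final_features != 0
print("Lasso kept %d of %d features" % (nonzero.sum(), len(final_features)))
for name, coef in zip(my_list_arr[nonzero], final_features[nonzero]):
    print(name, ":", coef)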
coef_path_lasso_cv.fit(X, y)
coef_path_binary_x_logistic_cv.fit(binary_X, binary_y)
coef_path_logistic_cv.fit(X, binary_y)
coef_path_elastic_cv.fit(X, y)

forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y,
                                                   n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y,
                                                  n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y,
                                                    n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y,
                                                     n_jobs=2, cv=CV, scoring='roc_auc')
binary_x_logistic_cv_score = cross_validation.cross_val_score(coef_path_binary_x_logistic_cv,
                                                              binary_X, binary_y,
                                                              n_jobs=2, cv=CV, scoring='roc_auc')

forest_results_parameters = [coef_path_forest_cv.predict(X),
                             coef_path_forest_cv.get_params(),
                             coef_path_forest_cv.feature_importances_,
                             coef_path_forest_cv.classes_,
                             coef_path_forest_cv.n_classes_]
forest_scores = [forest_cv_score,
                 classification_report(binary_y, forest_results_parameters[0]),
                 'forest']
lasso_results_parameters = [coef_path_lasso_cv.predict(X),
                            coef_path_lasso_cv.get_params(),
                            coef_path_lasso_cv.alphas_,
                            coef_path_lasso_cv.coef_]
lasso_scores = [lasso_cv_score, r2_score(y, lasso_results_parameters[0]), 'lasso']
elastic_results_parameters = [coef_path_elastic_cv.predict(X),
                              coef_path_elastic_cv.get_params(),
                              coef_path_elastic_cv.alphas_,
                              coef_path_elastic_cv.coef_]
elastic_scores = [elastic_cv_score, r2_score(y, elastic_results_parameters[0]), 'elastic']
logistic_results_parameters = [coef_path_logistic_cv.predict(X),
                               coef_path_logistic_cv.get_params(),
                               coef_path_logistic_cv.coef_]
logistic_scores = [logistic_cv_score,
                   classification_report(binary_y, logistic_results_parameters[0]),
                   'logistic']
# this model was fit on binary_X, so predict on binary_X as well
binary_x_logistic_results_parameters = [coef_path_binary_x_logistic_cv.predict(binary_X),
                                        coef_path_binary_x_logistic_cv.get_params(),
                                        coef_path_binary_x_logistic_cv.coef_]
binary_x_logistic_scores = [binary_x_logistic_cv_score,
                            classification_report(binary_y, binary_x_logistic_results_parameters[0]),
                            'binary_logistic']
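# A small summary sketch (hypothetical helper, not part of the original script):
# condense the per-model CV arrays gathered above into mean +/- std lines.
def summarize_cv_scores(named_scores):
    for name, scores in named_scores:
        print('%s: %0.4f +/- %0.4f' % (name, scores.mean(), scores.std()))

summarize_cv_scores([
    ('forest (roc_auc)', forest_cv_score),
    ('lasso', lasso_cv_score),
    ('elastic', elastic_cv_score),
    ('logistic (roc_auc)', logistic_cv_score),
    ('binary_x_logistic (roc_auc)', binary_x_logistic_cv_score),
])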
lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006,
                        0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1],
                max_iter=50000, cv=10)
lasso.fit(X_train, y)
alpha = lasso.alpha_
print "Best alpha :", alpha

print "Trying alphas centered around " + str(alpha)
lasso = LassoCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75,
                        alpha * .8, alpha * .85, alpha * .9, alpha * .95,
                        alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                        alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],
                max_iter=50000, cv=10)
lasso.fit(X_train, y)
lasso_pred1 = np.expm1(lasso.predict(X_test))

# ---------------- Take a weighted sum of the Lasso and XGB predictions ----------------#
final1234 = 0.7 * lasso_pred1 + 0.3 * xgb_p
f_sub = pd.DataFrame({"id": test.Id, "SalePrice": final1234})
f_sub.to_csv("Sol_1234.csv", index=False)
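# The coarse-then-fine alpha search above can be wrapped in a small helper
# (a sketch under the same imports; the function and its parameters are
# illustrative, not from the original script):
def refine_alpha(X, y, coarse_alphas, span=0.4, n_fine=16, cv=10, max_iter=50000):
    """Fit LassoCV on a coarse grid, then refit on a fine grid around the winner."""
    coarse = LassoCV(alphas=coarse_alphas, max_iter=max_iter, cv=cv).fit(X, y)
    best = coarse.alpha_
    fine_grid = [best * f for f in np.linspace(1 - span, 1 + span, n_fine)]
    return LassoCV(alphas=fine_grid, max_iter=max_iter, cv=cv).fit(X, y)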
msg("Fitting!") weights = np.ones(train.shape[0]) do_statsmodels=True if do_statsmodels: ols = sm.wls(formula=formula, data=train, weights=weights).fit() print ols.summary() msg("Making predictions for all playergames") yy_df['ols_prediction'] = ols.predict(yy_df) else: ols_lr = LassoCV(n_jobs=-1, verbose=True) X = train[rhs_cols] y = train['elo'] ols_lr.fit(X,y) yy_df['ols_prediction'] = ols_lr.predict(X) yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs() yy_df['training'] = (yy_df['gamenum'] % 3) insample_scores = yy_df.groupby('training')['ols_error'].agg({'mean' : np.mean, 'median' : np.median, 'stdev': np.std}) print insample_scores msg("Error summary by ELO:") elo_centuries = cut(yy_df['elo'], 20) print yy_df.groupby(elo_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean}) msg("Error summary by gamenum:") gamenum_centuries = cut(yy_df['gamenum'], 20) print yy_df.groupby(gamenum_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean}) msg("Writing yy_df back out with ols predictions inside")
#############################################################################################################
# 2. Lasso
#############################################################################################################
# The parameter alpha increases the L1 penalty when larger; alpha = 0 is ordinary linear regression.
# LassoCV incorporates iterating through many alphas and CV as well. Lasso can also be used for
# dimensionality reduction, as it is able to set coefficients to exactly 0.
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

lasso = LassoCV(alphas=np.logspace(-10, 10, 10), normalize=True, cv=10, positive=False)
lasso.fit(xtrain, ytrain)

# Train dataset performance
lasso_train_pred = lasso.predict(xtrain)
lasso_train_r2 = r2_score(ytrain, lasso_train_pred)
lasso_train_error = np.sqrt(mean_squared_error(ytrain, lasso_train_pred))

# Test dataset performance
lasso_test_pred = lasso.predict(xtest)
lasso_test_r2 = r2_score(ytest, lasso_test_pred)
lasso_test_error = np.sqrt(mean_squared_error(ytest, lasso_test_pred))

# Build coefficients table
from pandas import DataFrame
lassocoeff = DataFrame(data.columns, columns=['Features'])
lassocoeff['Coefficients'] = lasso.coef_

print 'LASSO ------------------------------------------------------------------------'
print '\nThe alpha (L1) level selected: {}'.format(lasso.alpha_)
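# A sketch (not part of the original analysis) of the dimensionality-reduction
# point above: refit plain Lasso over a range of alphas and count how many
# coefficients survive as the penalty grows.
from sklearn.linear_model import Lasso
for a in np.logspace(-4, 1, 6):
    n_kept = np.sum(Lasso(alpha=a, normalize=True).fit(xtrain, ytrain).coef_ != 0)
    print 'alpha = {:.4f}: {} non-zero coefficients'.format(a, n_kept)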