def remove_foreground_glm(x, y, spatial_mask=None, spectral_mask=None,
                          alphas=None, l1_ratio=1.):
    """Fit a (positive) generalized linear model of the templates `x` to the
    map `y` and return the model prediction.

    Args:
        x (ndarray): Templates, shape (n_templates, ...).
        y (ndarray): Target map to be modeled.
        spatial_mask (ndarray, optional): Boolean mask selecting pixels used in the fit.
        spectral_mask (ndarray, optional): Boolean mask selecting templates used in the fit.
        alphas (array-like, optional): Regularization strengths to cross-validate over.
        l1_ratio (float): 1. -> LassoCV, 0. -> RidgeCV, otherwise ElasticNetCV.

    Returns:
        tuple: (y_model, fitted regressor, coefficient per template)
    """
    # cast to double and reshape to (n_pixels, n_templates)
    x_rs = np.float64(x.reshape((x.shape[0], -1))).T
    y_rs = np.float64(y.flatten())
    if spatial_mask is None:
        spatial_mask_rs = np.ones_like(y_rs, dtype=bool)
    else:
        spatial_mask_rs = spatial_mask.flatten()
    if spectral_mask is None:
        spectral_mask = np.ones(x_rs.shape[1], dtype=bool)
    if alphas is not None:
        alphas = np.atleast_1d(alphas)

    # fit GLM
    if l1_ratio == 1.:
        reg = LassoCV(positive=True, alphas=alphas, n_jobs=-1, max_iter=5000)
    elif l1_ratio == 0.:
        reg = RidgeCV(alphas=alphas)
    else:
        reg = ElasticNetCV(positive=True, alphas=alphas, n_jobs=-1,
                           l1_ratio=l1_ratio)
    reg.fit(x_rs[spatial_mask_rs][:, spectral_mask], y_rs[spatial_mask_rs])

    y_model = reg.predict(x_rs[:, spectral_mask]).reshape(y.shape)
    glm_coeffs = np.zeros(x_rs.shape[1], dtype=np.float32)
    glm_coeffs[spectral_mask] += reg.coef_
    return y_model, reg, glm_coeffs
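# A minimal usage sketch for remove_foreground_glm. The template/map arrays
# and their shapes below are hypothetical, chosen only for illustration.
import numpy as np
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV

rng = np.random.default_rng(0)
templates = rng.normal(size=(3, 32, 32))          # 3 foreground templates
truth = 0.5 * templates[0] + 2.0 * templates[2]   # map built from the templates
noisy_map = truth + 0.05 * rng.normal(size=(32, 32))

y_model, reg, coeffs = remove_foreground_glm(templates, noisy_map)
residual = noisy_map - y_model                    # foreground-cleaned map
print(coeffs)                                     # expected roughly [0.5, 0.0, 2.0]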
def lassoCV_regression(data, target, alphas):
    clf = LassoCV(alphas=alphas)
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    data_transform = sfm.transform(data)
    n_features = data_transform.shape[1]

    # raise the threshold until at most two features remain
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    rmses = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(data_transform):
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)

    x0 = np.arange(1, 11)
    plt.figure()
    plt.plot(x0, rmses, label='LassoCV')
    plt.legend()
    plt.show()
    return rmses
def predict(self, trains_x, train_y, tests_x, parameters, times=10,
            isFile=True, foldername="blend-dir"):
    """
    Ensemble many feature sets and regressors by stacking (blending).

    :params trains_x: dict of training feature matrices, keyed by name
    :params train_y: training target vector
    :params tests_x: dict of test feature matrices, keyed by name
    """
    test_data_sample = list(tests_x.values())[0]
    if not os.path.exists(foldername):
        os.makedirs(foldername)

    # reuse the same K-fold split across runs so cached fold predictions stay valid
    kfold_file = foldername + "/kfold_index.pkl"
    if os.path.exists(kfold_file):
        skf = pickle.load(open(kfold_file, "rb"))
    else:
        skf = list(KFold(n_splits=times, shuffle=True).split(train_y))
        pickle.dump(skf, open(kfold_file, "wb"))

    blend_train = np.zeros((len(train_y), len(parameters)))
    blend_test = np.zeros((len(test_data_sample), len(parameters)))

    for j, parameter in enumerate(parameters):
        train_x = trains_x[parameter['data']]
        test_x = tests_x[parameter['data']]
        # one column per fold (the original allocated len(parameters) columns here)
        blend_test_tmp = np.zeros((len(test_data_sample), times))

        for i, (train_index, valid_index) in enumerate(skf):
            clf = model_select(parameter['parameter'])
            train = train_x[train_index]
            train_valid_y = train_y[train_index]
            kfold_filepath = "./" + foldername + "/parameter_{}_kfold_{}.pkl".format(j, i)

            if os.path.exists(kfold_filepath):
                # reuse the cached fold predictions instead of refitting
                blend_train_prediction, blend_test_prediction = pickle.load(
                    open(kfold_filepath, "rb"))
            else:
                clf.fit(train, np.log1p(train_valid_y))
                blend_train_prediction = np.expm1(clf.predict(train))
                blend_test_prediction = np.expm1(clf.predict(test_x))
                pickle.dump((blend_train_prediction, blend_test_prediction),
                            open(kfold_filepath, "wb"))
            blend_train[train_index, j] = blend_train_prediction
            blend_test_tmp[:, i] = blend_test_prediction
        blend_test[:, j] = blend_test_tmp.mean(1)

    # Blending model
    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5,
                   fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, train_y)
    y_test_predict = bclf.predict(blend_test)
    return y_test_predict
def bagging(self, trains, tests, train_y, model_name=None):
    blend_train = trains.T
    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5,
                   fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, train_y)
    y_test_predict = bclf.predict(tests.T)
    train_predict = bclf.predict(trains.T)
    return train_predict, y_test_predict
def fit_Lasso(features_train, labels_train, features_pred):
    model = LassoCV()
    model.fit(features_train, labels_train)
    # mse_path_ holds the CV mean squared error for each (alpha, fold) pair
    mse = model.mse_path_
    print("LASSO - Mean square error path shape: ", mse.shape)
    # Test the model
    labels_pred = model.predict(features_pred)
    return labels_pred
def lassocv_feature_select(df):
    """ Feature selection via LassoCV. """
    X = df.drop(['status'], axis=1)
    y = df['status']
    model_lasso = LassoCV(alphas=[0.1, 1, 0.001, 0.0005])
    model_lasso.fit(X, y)
    coef = pd.Series(model_lasso.coef_, index=X.columns)
    print(coef.sort_values(ascending=False))
def make_model_and_predict(train_file, test_file):
    """Given the name of a training csv file and the name of a test csv file,
    constructs a LassoCV model and outputs predictions to a time-stamped csv
    file. If the test file has SalaryNormalized as an attribute, it will score
    the model and write the result in the file "score<datetime>".
    """
    train = pd.read_csv(train_file)
    valid = pd.read_csv(test_file)
    number_of_word_features = 200
    title_words = count_words_in_column(train, "Title")
    key_count_pairs = [(k, v) for (k, v) in title_words.items()
                       if k not in stopwords.words('english')]
    key_count_pairs.sort(key=lambda kv: -kv[1])
    for word, count in key_count_pairs[:number_of_word_features]:
        add_appearance_count_feature(train, word, "Title")
        add_appearance_count_feature(valid, word, "Title")
    group_features = ["LocationNormalized", "Category", "Company", "SourceName"]
    for f in group_features:
        continuize_feature(train, valid, f, "SalaryNormalized")
    feature_columns = train.columns[12:]
    feature = train[feature_columns]
    label = train.SalaryNormalized
    clf = LassoCV()
    clf.fit(feature, label)
    valid_salary_predict = clf.predict(valid[feature_columns])
    valid["SalaryNormalized_Predict"] = valid_salary_predict
    date_string = re.sub("[ :.]", "", str(datetime.datetime.now()))
    predict_filename = 'predict' + date_string + '.csv'
    score_filename = 'score' + date_string + '.txt'
    with open(predict_filename, 'w') as f:
        valid[["Id", "SalaryNormalized_Predict"]].to_csv(f, index=False, header=False)
    ## Computes average RMS error and writes the score to file
    if hasattr(valid, 'SalaryNormalized'):
        score = 0
        for i, _ in enumerate(valid["SalaryNormalized_Predict"]):
            score += (valid.SalaryNormalized[i] - valid.SalaryNormalized_Predict[i]) ** 2
        score = math.sqrt(score / len(valid["SalaryNormalized_Predict"]))
        with open(score_filename, 'w') as f:
            f.write("Train: " + train_file + "\n")
            f.write("Test: " + test_file + "\n")
            f.write("Score: " + str(score) + "\n")
def lassocv_n_random_lasso(X, y, n_iter=30, test_size=0.2,
                           max_iter=50000, n_resampling=2000):
    # find a good alpha using cross-validation
    ss = ShuffleSplit(n_splits=n_iter, test_size=test_size)
    reg = LassoCV(normalize=True, cv=ss, max_iter=max_iter)
    reg.fit(X, y)

    # stability selection around the CV-chosen alpha
    # (RandomizedLasso is only available in scikit-learn < 0.21)
    reg = RandomizedLasso(alpha=reg.alpha_, n_resampling=n_resampling,
                          max_iter=max_iter, normalize=True)
    reg.fit(X, y)
    rank = reg.scores_.argsort()[::-1]
    return (rank, reg.scores_[rank])
def lassoRegularization(X, Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: report best RMSE value for lasso regularization
    """
    tuningAlpha = [0.1, 0.01, 0.001]
    lasso = LassoCV(normalize=True, alphas=tuningAlpha, cv=10)
    lasso.fit(X, Y)
    prediction = lasso.predict(X)
    print()
    print("LASSO REGULARIZATION")
    print("Best Alpha value for Lasso Regularization : " + str(lasso.alpha_))
    print('Best RMSE for corresponding Alpha =',
          np.sqrt(mean_squared_error(Y, prediction)))
def lasso_cv(x, y, x_pred=None, max_deg=3, cv=10, max_iter=1000,
             return_model=False):
    """LASSO polynomial fit with cross-validation.

    Regularized polynomial regression (by penalized least-squares) from a
    range of degrees up to n = max_deg. The LASSO regression minimises MSE and
    penalizes the size of the parameter vector using the L1-norm, which leads
    to fewer coefficients in the fitted model.

    - The 'alpha' parameter (amount of penalization) is selected by k-fold CV.
    - Predicts fitted model on given values 'x_pred' (default: use 'x').
    - Supports NaNs.
    """
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x_, y_ = x[ind], y[ind]
    X_ = dmatrix('C(x_, Poly)')
    if x_pred is None:
        X = dmatrix('C(x, Poly)')       # predict on original values
    else:
        X = dmatrix('C(x_pred, Poly)')  # predict on given values
    lasso = LassoCV(cv=cv, copy_X=True, normalize=True, max_iter=max_iter)
    lasso = lasso.fit(X_[:, 1:max_deg+1], y_)
    y_pred = lasso.predict(X[:, 1:max_deg+1])
    if return_model:
        y_pred = [y_pred, lasso]
    return y_pred
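# A minimal usage sketch for lasso_cv. The synthetic data is illustrative
# only, and the call assumes a scikit-learn version that still accepts the
# `normalize` keyword used inside the function (plus numpy and patsy.dmatrix
# being importable).
import numpy as np

x = np.linspace(-1, 1, 100)
y = 2 * x - x ** 2 + 0.1 * np.random.randn(100)
y_fit = lasso_cv(x, y, max_deg=3, cv=5)   # CV-regularized cubic fit of y on x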
class Trainer:
    clf = None
    svm = None

    def __init__(self):
        # string comparison must use ==, not `is` (identity)
        if config.model == 'SVM':
            self.svm = svm.SVC(kernel='linear', shrinking=True, verbose=False)
            params = {
                'C': np.logspace(-5, -1, num=20),  # Range of C values
            }
            self.clf = GridSearchCV(self.svm, params,
                                    cv=5,                # k-fold CV
                                    n_jobs=cpu_count(),  # Parallelize over CPUs
                                    verbose=1)
        elif config.model == 'Regression':
            self.clf = LassoCV(cv=3, max_iter=2000, n_jobs=cpu_count(),
                               verbose=True)

    def train(self, featMat, persist=True):
        # Preprocess
        scaler = StandardScaler()
        featMat.X = scaler.fit_transform(featMat.X, featMat.y)

        # Save preprocess output
        self.scaler = scaler
        if persist:
            joblib.dump(scaler, 'preprocess.out')

        # Perform CV
        print('Running trainer on %d rows of data with %d features.' % featMat.X.shape)
        self.clf.fit(featMat.X, featMat.y)

        # Save CV output
        if config.model == 'SVM':
            self.estimator = self.clf.best_estimator_
        elif config.model == 'Regression':
            self.estimator = self.clf
        print(self.estimator)
        if persist:
            joblib.dump(self.clf, 'cv.out')
class LocalRegression:
    """This class implements "local" regression. Given a set of training data
    and a set of unknown data, iterate through each unknown spectrum, find the
    nearest training spectra, and generate a model. Each of these local models
    is optimized using built-in cross validation methods from scikit-learn."""

    def __init__(self, params, n_neighbors=250):
        """Initialize LocalRegression.

        Arguments:
            params = Dict containing the keywords and parameters for the
                regression method to be used.

        Keyword arguments:
            n_neighbors = User-specified number of training spectra to use to
                generate the local regression model for each unknown spectrum.
        """
        # For now, the only option is LASSO; other methods may be added in the
        # future. params is a dict of keyword arguments for LassoCV.
        self.model = LassoCV(**params)
        self.neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit_predict(self, x_train, y_train, x_predict):
        """Use local regression to predict values for unknown data.

        Arguments:
            x_train = The training data spectra.
            y_train = The values of the quantity being predicted for the training data.
            x_predict = The unknown spectra for which y needs to be predicted.
        """
        self.neighbors.fit(x_train)
        predictions = []
        coeffs = []
        intercepts = []
        for i in range(x_predict.shape[0]):
            print('Predicting spectrum ' + str(i + 1))
            x_temp = np.array(x_predict[i])
            dist, ind = self.neighbors.kneighbors([x_temp])
            x_train_local = np.squeeze(x_train[ind])
            y_train_local = np.squeeze(y_train[ind])

            # NOTE: this group-wise split is built but never passed to the
            # model; LassoCV uses the `cv` value given in `params`.
            cv = GroupKFold(n_splits=3)
            cv = cv.split(x_train_local, y_train_local, groups=y_train_local)

            self.model.fit(x_train_local, y_train_local)
            predictions.append(self.model.predict([x_temp])[0])
            coeffs.append(self.model.coef_)
            intercepts.append(self.model.intercept_)
        return predictions, coeffs, intercepts
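# A minimal usage sketch for LocalRegression. The array shapes and the LassoCV
# keyword dict are illustrative assumptions, not part of the original code.
import numpy as np

rng = np.random.RandomState(0)
x_train = rng.rand(500, 40)                # 500 training spectra, 40 channels
y_train = 3.0 * x_train[:, 0] + 0.01 * rng.randn(500)
x_unknown = rng.rand(4, 40)                # 4 spectra to predict

lr = LocalRegression(params={'cv': 3}, n_neighbors=50)
preds, coeffs, intercepts = lr.fit_predict(x_train, y_train, x_unknown)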
def lassocvclassifier(training_samples, eval_samples, vectorizer, do_grid_search=False):
    X_train, Y_train = training_samples
    X_eval, Y_eval = eval_samples
    #clf = SGDClassifier(loss='log', penalty='l2', l1_ratio=0.0, n_iter=30,
    #                    shuffle=True, verbose=False, n_jobs=4, alpha=1e-4,
    #                    average=True, class_weight=None)
    clf = LassoCV()
    clf.fit(X_train, Y_train)
    #y_train_true, y_train_pred = Y_train, clf.predict(X_train)
    print_top_10_words = True
    # NOTE: LassoCV is a regressor, so log-loss scoring and predict_proba
    # (used in the original snippet) are not available; a regression metric
    # is used instead.
    scores = cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5,
                             scoring='neg_mean_squared_error')
    print(scores, np.mean(scores), np.median(scores))
    print(clf)
    #scores = cross_val_score(clf.best_estimator_, X_train, Y_train, cv=10,
    #                         scoring='neg_mean_squared_error')
    #print(scores, np.mean(scores), np.median(scores))
    y_true, y_pred = Y_eval, clf.predict(X_eval)
def _regression(self, i_start, i_end):
    """ Lasso model fitted on the window [i_start, i_end). """
    X, y = self._AssembleRegressionData_i(i_start, i_end)
    lasso = LassoCV(cv=10)
    lasso.fit_intercept = True
    lasso.fit(X, y)
    res = {
        "reg_result": lasso,
        # TODO: also expose the fitted coefficients, e.g.
        # "reg_coefficients": list(lasso.coef_),
        # so that predict() can report them.
    }
    return res
def get_model_per_cluster(X, Y):
    model_per_cluster = {}
    for c in X.cluster.unique():
        X_cluster = X[X.cluster == c]
        Y_true = Y[Y.cluster == c].ALSFRS_slope
        regr = LassoCV(cv=5)
        regr.fit(X_cluster, Y_true)
        print('cluster: %d size: %s' % (c, Y_true.shape))
        Y_predict = regr.predict(X_cluster)
        print("\t RMS error (0 is perfect): %.2f"
              % np.sqrt(np.mean((Y_predict - Y_true) ** 2)))
        # R^2 = 1 - SS_residual / SS_total; equivalent to regr.score(X_cluster, Y_true)
        residual_SS = ((Y_predict - Y_true) ** 2).sum()
        total_SS = ((Y_true - Y_true.mean()) ** 2).sum()
        print('\t coefficient of determination R^2 = %.2f '
              % (1.0 - residual_SS / total_SS))
        # equivalent to scipy.stats.pearsonr(Y_predict, Y_true)[0]
        cov = sum((Y_predict - Y_predict.mean()) * (Y_true - Y_true.mean()))
        Y_predict_std = np.sqrt(sum((Y_predict - Y_predict.mean()) ** 2))
        Y_true_std = np.sqrt(sum((Y_true - Y_true.mean()) ** 2))
        print('\t pearson correlation r = %.2f ' % (cov / (Y_predict_std * Y_true_std)))
        print("3 sample predictions: ", regr.predict(X_cluster)[:3])
        model_per_cluster[c] = {"cluster_train_data_means": X_cluster.mean(),
                                "model": regr}
    return model_per_cluster
def Lasso_model(train_linear, test_linear):
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(
        train_linear_fea, train_linear_tar, test_size=0.2, random_state=0)
    real_train_tar = np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)
    start = time.time()
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso = pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_lasso_predict = lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_lasso = np.expm1(lasso.predict(test_linear))
    write_pkl(lassocv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl')
    return test_prediction_lasso
)
feature_engg_linreg_mse_test = metrics.mean_squared_error(
    test_data["casual_log"],
    feature_engg_linreg_model.predict(test_data.drop(target, axis=1))
)

# Not much difference? -> Doesn't look like we are overfitting!
# But how do we perform shrinkage/penalized regression in general?
from sklearn.linear_model import LassoCV

feature_engg_lassocv_model = LassoCV(max_iter=50, cv=3, n_jobs=-1, random_state=42)
feature_engg_lassocv_model.fit(train_data.drop(target, axis=1), train_data["casual_log"])

feature_engg_lassocv_mse_train = metrics.mean_squared_error(
    train_data["casual_log"],
    feature_engg_lassocv_model.predict(train_data.drop(target, axis=1))
)
feature_engg_lassocv_mse_test = metrics.mean_squared_error(
    test_data["casual_log"],
    feature_engg_lassocv_model.predict(test_data.drop(target, axis=1))
)

# Check the performance on the test set
print(feature_engg_linreg_mse_test)
print(feature_engg_lassocv_mse_test)
# Penalization decreases performance?
"""Вопрос 2. Какой признак линейная регрессия считает наиболее сильно влияющим на качество вина?""" linreg_coef = pd.DataFrame({'coef': linreg.coef_}, index=X_train.columns) linreg_coef.sort_values(by='coef', inplace=True) lasso1 = Lasso(alpha=0.01, random_state=17) lasso1.fit(X_train_scaled, y_train) lasso1_coef = pd.DataFrame({'coef': lasso1.coef_}, index=X_train.columns) lasso1_coef.sort_values(by='coef', inplace=True) alphas = np.logspace(-6, 2, 200) lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=17) lasso_cv.fit(X_train_scaled, y_train) """Вопрос 3. Какой признак "обнулился первым" в настроенной модели LASSO?""" lasso_cv_coef = pd.DataFrame({'coef': lasso_cv.coef_}, index=X_train.columns) lasso_cv_coef.sort_values(by='coef', inplace=True) print("Mean squared error (train): %.3f" % mean_squared_error(y_train, lasso_cv.predict(X_train_scaled))) print("Mean squared error (test): %.3f" % mean_squared_error(y_holdout, lasso_cv.predict(X_holdout_scaled))) forest = RandomForestRegressor(random_state=17) forest.fit(X_train, y_train) print("Mean squared error (train): %.3f" % mean_squared_error(y_train, forest.predict(X_train))) print("Forest Mean squared error (cv): %.3f" % cross_val_score(forest, X_train, y_train, scoring='neg_mean_squared_error').mean())
rf_mask = rf.feature_importances_ > 0.10
#reduced_X = X.loc[:, rf_mask]

# 3.2 RF + RFE
rfe = RFE(estimator=rf, n_features_to_select=5, step=2, verbose=0)
rfe.fit(X_train, y_train)
rfe_mask = rfe.support_
print("{0:.1} RF RFE R^2 on test set.".format(rfe.score(X_test, y_test)))
mse = MSE(y_test, rfe.predict(X_test))
print("{} RF RFE RMSE on test set.".format(mse ** 0.5))

# 3.3 LassoCV
lasso_CV = LassoCV(n_alphas=250, cv=4, n_jobs=2)
lasso_CV.fit(X_train, y_train)
lcv_mask = lasso_CV.coef_ != 0
print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask)))
lasso_CV_coefs = dict(zip(lasso_CV.coef_.round(4), X_sc.columns))
print("{0:.1} LassoCV R^2 on test set.".format(lasso_CV.score(X_test, y_test)))
mse = MSE(y_test, lasso_CV.predict(X_test))
print("{} LassoCV RMSE on test set.".format(mse ** 0.5))

# 3.4 Gradient Boosting
gbr = GBR(n_estimators=250)
rfe = RFE(estimator=gbr, n_features_to_select=5, step=2, verbose=0)
rfe.fit(X_train, y_train)
gbr_mask = rfe.support_
print("{0:.1} GBR RFE R^2 on test set.".format(rfe.score(X_test, y_test)))
mse = MSE(y_test, rfe.predict(X_test))
print("{} GBR RFE RMSE on test set.".format(mse ** 0.5))
y_val = [int(x) for x in y_test]
test_pred = [int(x) for x in test_pred]
print("=" * 60)
print("Logistic Regression with L1-based feature selection(0.001)")
print("=" * 60)
print(classification_report(y_test, test_pred))
print('Accuracy: %s\n' % accuracy_score(test_pred, y_test))

################ Logistic Regression with L1-based feature selection (CV) ################
features = tfidf.fit_transform(X_train).toarray()
lassocv = LassoCV()
lassocv.fit(features, y_train)
alpha = lassocv.alpha_

lor_l1 = Pipeline([
    ('vect', tfidf),
    ('tfidf', TfidfTransformer()),
    ('predict', Lasso(alpha=alpha, max_iter=15)),
])
lor_l1.fit(X_train, y_train)
test_pred = lor_l1.predict(X_test)
y_val = [int(x) for x in y_test]
test_pred = [int(x) for x in test_pred]
print("=" * 60)
# for alpha in alphas:
#     clf = Lasso(alpha)
#     test_score = np.sqrt(-cross_val_score(clf, X_train, sale_price, cv=10,
#                                           scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(alphas, test_scores)
# plt.title("Alpha vs CV Error")
# plt.show()

# Parameter tuning (alpha) for Lasso regression - method 2
# Re-write this part (inspired from kernel)
lscv = LassoCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
               max_iter=5000, n_alphas=1000, n_jobs=1, normalize=False,
               positive=False, precompute='auto', random_state=None,
               selection='cyclic', tol=0.0001, verbose=False)
lscv.fit(X_train, sale_price)

# the best alpha is lscv.alpha_
# print(lscv.alpha_)  # gives 0.000482366054317

lasso = Lasso(alpha=lscv.alpha_, max_iter=5000)
lasso.fit(X_train, sale_price)

# Since the model is trained on log of sale price, back-transform it for output
y_lasso = np.exp(lasso.predict(X_test))
print(y_lasso)

y_final = (0.7 * y_ridge) + (0.3 * y_lasso)
final_output = pd.DataFrame(data={'Id': test.index, 'SalePrice': y_final})
print(final_output.head(10))
final_output.to_csv('output_ridge.csv', index=False)
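# Side note on the back-transform above (a self-contained sanity check, not
# original code): np.exp inverts np.log, while np.expm1 is the correct
# inverse when the target was created with np.log1p.
import numpy as np

price = np.array([100000., 250000.])
assert np.allclose(np.exp(np.log(price)), price)
assert np.allclose(np.expm1(np.log1p(price)), price)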
training_data.loc[training_data['smoker'] == 'yes', 'smoker'] = 1
training_data.loc[training_data['smoker'] == 'no', 'smoker'] = 0
training_data.loc[:, 'northeast'] = training_data['region'] == 'northeast'
training_data.loc[:, 'northwest'] = training_data['region'] == 'northwest'
training_data.loc[:, 'southeast'] = training_data['region'] == 'southeast'
training_data.loc[:, 'southwest'] = training_data['region'] == 'southwest'
del training_data['region']

x_train = training_data[['age', 'is_male', 'bmi', 'children', 'smoker',
                         'northeast', 'northwest', 'southeast', 'southwest']]\
    .to_numpy(dtype=np.float64)
y_train = training_data['charges'].to_numpy(dtype=np.float64)

model = LassoCV(normalize=True)
model.fit(x_train, y_train)

raw_test_data = pd.read_csv('public_dataset/test_sample.csv')
test_data = raw_test_data.copy()
test_data.loc[test_data['sex'] == 'female', 'sex'] = 0
test_data.loc[test_data['sex'] == 'male', 'sex'] = 1
test_data.rename(columns={'sex': 'is_male'}, inplace=True)
test_data.loc[test_data['smoker'] == 'yes', 'smoker'] = 1
test_data.loc[test_data['smoker'] == 'no', 'smoker'] = 0
test_data.loc[:, 'northeast'] = test_data['region'] == 'northeast'
test_data.loc[:, 'northwest'] = test_data['region'] == 'northwest'
test_data.loc[:, 'southeast'] = test_data['region'] == 'southeast'
test_data.loc[:, 'southwest'] = test_data['region'] == 'southwest'
def get_keywords_with_regression(self, random_seed=923, ngram_range=(1, 1),
                                 min_df=0.01, max_df=0.85, apply_smote=True):
    """ Get the keywords using LASSO's feature selection technique """
    # a dataframe of all the reviews
    df = pd.DataFrame({"review_text": self.corpus, "labels": self.labels})
    # Get positive examples
    pos_df = df.loc[df['labels'] == 1, :].reset_index(drop=True)
    # Get negative examples
    neg_df = df.loc[df['labels'] == 0, :].reset_index(drop=True)
    # get the class sizes
    pos_count = pos_df.shape[0]
    neg_count = neg_df.shape[0]
    if min([pos_count, neg_count]) < 100:
        print('Warning: Number of minority class less than 100')
    if min([pos_count, neg_count]) / max([pos_count, neg_count]) < 0.333:
        print("Class imbalance detected")

    # removed the iteration
    #salient_terms = dict()
    df_combine = pos_df.append(neg_df)
    df_combine.reset_index(drop=True, inplace=True)
    target = df_combine['labels']

    # Fit the TF-IDF vectorizer
    vec = TfidfVectorizer(ngram_range=ngram_range, tokenizer=self.tokenizer,
                          min_df=min_df, max_df=max_df)
    vec_f = vec.fit(df_combine['review_text'])
    # Create the training document-term matrix
    train_dtm = vec_f.transform(df_combine['review_text'])

    if apply_smote:
        # Apply SMOTE to fix the class imbalance problem
        sm = SMOTE(random_state=923, ratio=1)
        X_input, y_input = sm.fit_resample(train_dtm, target)
    else:
        X_input, y_input = train_dtm, target

    """ Modeling """
    lasso = LassoCV(cv=5)
    lasso_f = lasso.fit(X_input, y_input)

    """ Construct the coefficient table """
    coef_table = pd.DataFrame({
        "feature": vec_f.get_feature_names(),
        "coef": lasso_f.coef_
    })
    # Select only the positive coefficients
    key_terms_df = coef_table[['feature', 'coef']][coef_table['coef'] > 0]
    key_terms_df.reset_index(drop=True, inplace=True)

    """ Next find the DF for the terms """
    # Get the index for each of the terms with positive label
    train_feature_tb = pd.DataFrame(train_dtm.toarray())
    train_feature_tb.columns = vec_f.get_feature_names()
    dtm_key_terms = train_feature_tb[list(key_terms_df['feature'])]
    # replace the vectors in the tf-idf table with boolean vectors
    for term in dtm_key_terms:
        dtm_key_terms[term] = [x != 0 for x in dtm_key_terms[term]]
    frequency_count = pd.DataFrame(dtm_key_terms.sum(axis=0)).reset_index()
    frequency_count.columns = ['feature', 'count']
    frequency_count['prop'] = frequency_count['count'] / dtm_key_terms.shape[0]

    # Join the two tables on features to get all the info needed
    significant_terms = pd.merge(key_terms_df, frequency_count,
                                 how="left", on="feature")

    """ As a last step, let's tag each term and separate into Nouns, Verbs and Adjs """
    class output:
        significant_terms_tb = significant_terms
        dtm_boolean = dtm_key_terms
        #salient_terms_dict = salient_terms

    return output
#miss_cal = test_data[quality].isnull().sum().sort_values(ascending=False)
#miss = miss_cal[miss_cal > 0].index
#test_data[miss] = test_data[miss].fillna(test_data[miss].mean())
#print(test_data[miss].isnull().sum())
#print(train_data_x.shape)
#print(train_data_y)
#print(test_data[np.isnan(test_data).any])

#line_reg = LinearRegression()  # 0.508
#line_reg.fit(new_tr_x, train_data_y)

re1 = list(test_data['Id'])
ind = test_data['Id']
train_data_x = train_data_x.drop('Id', axis=1)
test_data = test_data.drop('Id', axis=1)

las = LassoCV()
las.fit(new_tr_x, train_data_y)
para = {
    'n_alphas': [i for i in range(50, 200)],
    'max_iter': [j for j in range(500, 2000)]
}
clt = GridSearchCV(las, param_grid=para)
clt.fit(train_data_x, train_data_y)
test_data_y3 = clt.predict(test_data)

#train_data_x = preprocessing.normalize(train_data_x)
#test_data = preprocessing.normalize(test_data)

boostreg = GradientBoostingRegressor()  # learning_rate=0.016, n_estimators=1000 -> 0.161
#boostreg.fit(new_tr_x, train_data_y)
#train_data_y = np.log1p(train_data_y)
boostreg.fit(train_data_x, train_data_y)
#my_pip = RandomForestRegressor()  # 0.172
def yuchuli(file_path):
    global special
    alphaslist = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100, 1000, 10000]  # list of alphas
    df = pd.read_excel(file_path)
    head_list = df.columns.values
    special_head = head_list[special]
    drop_index.remove(special)
    dataset = df.drop(df.columns[drop_index], axis=1)
    new_head_list = dataset.columns.values
    new_special = list(new_head_list).index(special_head)
    # column index of the target feature after dropping columns
    special = new_special
    # h = 13  # which city
    # dataset = dataset[20 * h - 20:20 * h]
    dataset = dataset.fillna(0.1)
    y = dataset.iloc[:, special]
    x = dataset.drop(dataset.columns[[special]], axis=1)

    po = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
    x_poly = po.fit_transform(x)
    x_change = pd.DataFrame(x_poly, columns=po.get_feature_names())
    labels = x_change.columns

    X_train, X_test, Y_train, Y_test = train_test_split(x_change, y, test_size=0.2, random_state=1)
    model2 = LassoCV(alphas=alphaslist)  # cross-validate over the alpha list
    model2.fit(X_train, Y_train)
    # model2 = Lasso(max_iter=iterations, alpha=1).fit(X_train, Y_train)
    index = model2.coef_
    newindex = []
    if len(labels) == len(index):
        for i in range(len(index)):
            # keep coefficients whose absolute value is at least 0.00001
            if abs(index[i]) >= 0.00001:
                newindex.append(i)
    ypre = model2.predict(X_test)
    x_changenew = x_change.iloc[:, newindex]  # build the reduced dataset

    # refit a linear model on the selected features
    X_train, X_test, Y_train, Y_test = train_test_split(x_changenew, y, test_size=0.2, random_state=1)
    model2 = RidgeCV(alphas=alphaslist)
    model2.fit(X_train, Y_train)
    # model2 = Lasso(max_iter=iterations).fit(X_train, Y_train)
    index = model2.coef_
    gongshi = []
    for i in range(len(index)):
        if index[i] >= 0 and i != 0:
            gongshi.append('+')
        gongshi.append('%.5f' % index[i] + labels[i])
    return ypre, Y_test, gongshi
print('Linear Regression RMSE: %0.5f' % np.sqrt(metrics.mean_squared_error(y_train, y_pred_lr)))
print('Linear Regression R^2: %0.5f' % metrics.explained_variance_score(y_train, y_pred_lr))
print("predict time:", round(time.time() - t1, 3), "s")

# Store standard error
se_lr = stats.sem(y_pred_lr)

# 4. Lasso Regression model
# For train MSE

# Initialise Lasso Regression model
lcv = LassoCV()
t0 = time.time()
lcv.fit(X_train, y_train)

# Make prediction
t1 = time.time()
y_pred_lcv = lcv.predict(X_test)

# Return results
print('Lasso Regression MAE: %0.5f' % metrics.mean_absolute_error(y_test, y_pred_lcv))
print('Lasso Regression MSE: %0.5f' % metrics.mean_squared_error(y_test, y_pred_lcv))
print('Lasso Regression RMSE: %0.5f' % np.sqrt(metrics.mean_squared_error(y_test, y_pred_lcv)))
print('Lasso Regression R^2: %0.5f' % metrics.explained_variance_score(y_test, y_pred_lcv))
print("fitting time:", round(time.time() - t0, 3), "s")
print("predict time:", round(time.time() - t1, 3), "s")

se_lcv = stats.sem(y_pred_lcv)
y_pred1 = LR.predict(scaler.transform([[300, 110, 5, 5, 5, 10, 1]]))

# In[31]:

y_pred1

# In[32]:

# Lasso regularization:
# LassoCV will return the best alpha and coefficients after performing 10-fold cross-validation
lasscv = LassoCV(alphas=None, cv=10, max_iter=100000, normalize=True)
lasscv.fit(x_train, y_train)

# In[33]:

# best alpha parameter
alpha = lasscv.alpha_
alpha

# In[34]:

# now that we have the best parameter, let's use Lasso regression and see how well our data has fitted before
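# A hedged sketch of the step the comment above leads into: refitting a plain
# Lasso with the CV-chosen alpha. This is an assumption about the next cell,
# not original code.
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=alpha)
lasso_reg.fit(x_train, y_train)
lasso_reg.score(x_test, y_test)  # R^2 with the CV-chosen alpha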
#%% LASSO
"""
Lasso Regression:
The cost function for Lasso (least absolute shrinkage and selection operator)
regression can be written as

    J(beta) = sum_i (y_i - x_i . beta)^2 + lambda * sum_j |beta_j|

Just like the Ridge regression cost function, for lambda = 0 the equation
above reduces to the ordinary least-squares cost. The only difference is that
the magnitudes of the coefficients are penalized instead of their squares.
This type of regularization (L1) can lead to zero coefficients, i.e. some of
the features are completely neglected for the evaluation of output. So Lasso
regression not only helps in reducing over-fitting but can also help us with
feature selection. Just like Ridge regression, the regularization parameter
(lambda) can be controlled.
"""

#%% LASSO CV: test different alphas
lass_cv = LassoCV(alphas=[1, 0.7, 0.5, 0.3, 0.1, 0.01, 0.001, .0005, .0001, .00001],
                  cv=3, tol=.001, random_state=67, normalize=True)
# fit to data
lass_cv.fit(X_sample, y_sample)
# score on the same sample
lass_cv.score(X_sample, y_sample)
# best alpha
lass_cv.alpha_

#%% Test more alphas based off of the last cell
lass_cv = LassoCV(alphas=[.008, .009, .0001, .0002, .0003, .0004],
                  cv=3, random_state=67, normalize=True)
# fit to data
lass_cv.fit(X_sample, y_sample)
# score on the same sample
lass_cv.score(X_sample, y_sample)
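#%% A small self-contained check of the claim above that L1 regularization
# drives some coefficients exactly to zero while L2 does not (synthetic data,
# illustrative only; uses modern scikit-learn defaults without `normalize`).
import numpy as np
from sklearn.linear_model import LassoCV, RidgeCV

rng = np.random.RandomState(67)
X = rng.randn(200, 10)
y = 2.0 * X[:, 0] - 1.0 * X[:, 1] + 0.1 * rng.randn(200)  # 2 informative features

lasso = LassoCV(cv=3).fit(X, y)
ridge = RidgeCV().fit(X, y)
print((lasso.coef_ == 0).sum())  # typically several exact zeros
print((ridge.coef_ == 0).sum())  # typically none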
def reg_pro_aprobacion():
    datos = pd.read_csv('Datos/Datos_MEN.csv', header=0)
    # get the year column from the data
    datos['AÑO'] = datos['AÑO'].astype('int64')
    # take only the values of the column
    X = datos['AÑO'].values
    X = X[:, np.newaxis, ]
    # get the approval column from the data
    y = datos['APROBACIÓN_MEDIA'].values  # mean approval

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # create the linear models
    lr = LinearRegression()
    rgl = LassoCV(cv=4)
    rgr = RidgeCV(alphas=[0.1, 0.2, 0.5, 1.0, 3.0, 5.0, 10.0])

    # train the models on mean approval
    lr.fit(X_train, y_train)
    rgl.fit(X_train, y_train)
    rgr.fit(X_train, y_train)

    # make the predictions for mean approval
    y_pred = lr.predict(X_test)
    y_predrgl = rgl.predict(X_test)
    y_predrgr = rgr.predict(X_test)

    # periods to project -------------------------------------------------
    año_actual = datetime.datetime.now().year
    periodos = [año_actual, año_actual + 1, año_actual + 2,
                año_actual + 3, año_actual + 4]
    periodos = np.reshape(periodos, (-1, 1))

    # mean approval projections
    y_pro2 = lr.predict(periodos)
    y_prorgl2 = rgl.predict(periodos)
    y_prorgr2 = rgr.predict(periodos)

    print('APPROVAL MODEL DATA')
    print('Ordinary Least Squares regression')
    # Coefficients
    # m_coe = lr.coef_
    print('Coefficients:', lr.coef_)
    # MSE
    # m_mse = "{0:.4f}".format(np.mean((lr.predict(X_test) - y_test) ** 2))
    print("Residual sum of squares: %.2f" % np.mean((lr.predict(X_test) - y_test) ** 2))
    # Explained variance
    # m_ve = "{0:.4f}".format(abs(lr.score(X_test, y_test)))
    print('Explained variance: %.2f\n' % lr.score(X_test, y_test))

    print('Lasso regression')
    # l_coe = rgl.coef_
    print('Coefficients:', rgl.coef_)
    # l_mse = "{0:.4f}".format(np.mean((rgl.predict(X_test) - y_test) ** 2))
    print("Residual sum of squares: %.2f" % np.mean((rgl.predict(X_test) - y_test) ** 2))
    # l_ve = "{0:.4f}".format(abs(rgl.score(X_test, y_test)))
    print('Explained variance: %.2f\n' % rgl.score(X_test, y_test))

    print('Ridge regression')
    # r_coe = rgr.coef_
    print('Coefficients:', rgr.coef_)
    # r_mse = "{0:.4f}".format((np.mean(rgr.predict(X_test) - y_test) ** 2))
    print("Residual sum of squares: %.2f" % np.mean((rgr.predict(X_test) - y_test) ** 2))
    # r_ve = "{0:.4f}".format(abs(rgr.score(X_test, y_test)))
    print('Explained variance: %.2f\n' % rgr.score(X_test, y_test))

    fig1 = plt.figure(figsize=(12, 8), dpi=120)
    fig1.subplots_adjust(hspace=0.5, wspace=0.5)
    ax = fig1.add_subplot(2, 1, 1)
    ax.scatter(X, y)
    ax.set_xlabel('Periods')
    ax.set_ylabel('Approval rate')
    ax.set_title('Approval plot')

    ax2 = fig1.add_subplot(2, 1, 2)
    ax2.scatter(X_test, y_test, color='black')
    ax2.plot(X_test, y_pred, color='blue', linewidth=3, label=u'OLS regression')
    ax2.plot(X_test, y_predrgl, color='yellow', linewidth=3, label=u'Lasso regression')
    ax2.plot(X_test, y_predrgr, color='green', linewidth=3, label=u'Ridge regression')
    ax2.set_title(u'Mean approval regression by 3 different methods')
    ax2.set_xlabel('Periods')
    ax2.set_ylabel('Mean approval regression')
    # plt.legend()
    # ax2.set_xticks(())
    # ax2.set_yticks(())
    ruta_reg = "static/file/regresion_aprobacion.png"
    fig1.savefig(ruta_reg)
    # plt.show()

    fig01 = plt.figure(figsize=(12, 8), dpi=120)
    fig01.subplots_adjust(hspace=0.5, wspace=0.5)
    ax = fig01.add_subplot(1, 1, 1)
    ax.scatter(periodos, y_pro2, color='blue')
    ax.scatter(periodos, y_prorgl2, color='yellow')
    ax.scatter(periodos, y_prorgr2, color='green')
    # ax2.plot(periodos, y_pred, color='blue', linewidth=3, label=u'OLS regression')
    # ax2.plot(periodos, y_predrgl, color='yellow', linewidth=3, label=u'Lasso regression')
    # ax2.plot(periodos, y_predrgr, color='green', linewidth=3, label=u'Ridge regression')
    ax.set_title(u'Projection of the next 5 school periods')
    ax.set_xlabel('Periods')
    ax.set_ylabel('Mean approval projection')
    ruta_pro = "static/file/pro_aprobacion.png"
    fig01.savefig(ruta_pro)
    return ruta_reg, ruta_pro
def pert_test(clusters, energies, counts, comps, noise=0.1, Normalize=True,
              Intercept=True, Energy_above_hull=True, name=''):
    fold_pick = 10
    counts = np.array(counts)
    energies = np.array(energies)

    ###- scale to energy above hull -###
    if Energy_above_hull:
        y1 = min(energies)
        y2 = max(energies)
        x2 = min(comps)
        x1 = max(comps)
        # perpendicular distance from (x0, y0) to the line through (x1, y1)-(x2, y2)
        scale = lambda x0, y0, x1, y1, x2, y2: abs(
            (y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / np.sqrt(
            (y2 - y1) ** 2 + (x2 - x1) ** 2)
        energies = np.array([scale(comps[i], energies[i], x1, y1, x2, y2)
                             for i in range(len(energies))])

    ###- Set up output file -###
    file_out = 'pert_summery.txt'
    file = open(file_out, 'w')

    ###- Set up alphas for CV -###
    alpha_range = [-10, 10]
    alpha_lasso = np.logspace(alpha_range[0], alpha_range[1], num=1000)
    n_alphas = 1010
    alpha_ridge = np.logspace(-15, 10, n_alphas)

    # LASSO and RIDGE with cross-validation, plus plain linear regression
    lassocv = LassoCV(alphas=alpha_lasso, normalize=Normalize,
                      fit_intercept=Intercept, cv=fold_pick, max_iter=100000)
    ridgecv = RidgeCV(alphas=alpha_ridge, normalize=Normalize,
                      fit_intercept=Intercept, cv=None, store_cv_values=True)
    linreg = LinearRegression(fit_intercept=Intercept, normalize=Normalize)

    # Fit to data for each method over a grid of noise levels
    # (this grid overrides the `noise` argument)
    noise = np.linspace(0.001, 1, 25)
    lasso_vars = [[] for _ in range(len(clusters))]
    for n in noise:
        lasso_coefs = []
        ridge_coefs = []
        linreg_coefs = []
        lassocv.fit(counts, energies)
        lasso_coefs.append(lassocv.coef_)
        ridgecv.fit(counts, energies)
        ridge_coefs.append(ridgecv.coef_)
        linreg.fit(counts, energies)
        linreg_coefs.append(linreg.coef_)
        for i in range(100):
            data_noise = np.random.normal(0, n, counts.shape)
            counts_new = counts + data_noise
            data_noise = np.random.normal(0, n, energies.shape)
            energies_new = energies + data_noise
            lassocv.fit(counts_new, energies_new)
            lasso_coefs.append(lassocv.coef_)
            ridgecv.fit(counts_new, energies_new)
            ridge_coefs.append(ridgecv.coef_)
            linreg.fit(counts_new, energies_new)
            linreg_coefs.append(linreg.coef_)
        lasso_coefs = np.array(lasso_coefs)
        ridge_coefs = np.array(ridge_coefs)
        linreg_coefs = np.array(linreg_coefs)
        # variance of each cluster's lasso coefficient across perturbations
        for i in range(len(clusters)):
            data = np.transpose(lasso_coefs[:, i])
            var = data.var()
            lasso_vars[i].append(var)
    for i in range(len(lasso_vars)):
        plt.plot(noise, lasso_vars[i])
    file.close()
    return
predictors = df_train.drop([target], axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    predictors, train_target, train_size=1 - tam_test,
    test_size=tam_test, random_state=0)
X_pred = df_pred

# In[]: Lasso Model
lasso = LassoCV(alphas=[0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01,
                        0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60, 100],
                max_iter=50000, cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
lasso = LassoCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75,
                        alpha * .8, alpha * .85, alpha * .9, alpha * .95,
                        alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
                        alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],
                max_iter=50000, cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)
class MovieTrainer(object):
    def __init__(self, training_file, test_file):
        self._training_pickle = training_file
        self._test_pickle = test_file
        # to be defined later
        self._list_of_dicts = None
        self._dataframe = None
        self._features = None
        self._test_features = None
        self._labels = None
        self._clf = None
        self._training_frame = None
        self._test_frame = None
        self._prediction_frame = None
        # dicts
        self._actor_dict = None
        self._director_dict = None
        self._genre_dict = None
        self._production_house = None

    def _load_dataframe(self):
        if os.path.isfile(self._training_pickle):
            self._training_dict = pickle.load(open(self._training_pickle, "rb"))
        else:
            raise AttributeError("Cannot find pickle file:%s" % self._training_pickle)
        if os.path.isfile(self._test_pickle):
            self._test_dict = pickle.load(open(self._test_pickle, "rb"))
        else:
            raise AttributeError("Cannot find pickle file:%s" % self._test_pickle)
        # load pandas frames
        self._training_frame = pd.DataFrame(self._training_dict)
        self._test_frame = pd.DataFrame(self._test_dict)
        # drop movies with no names
        self._training_frame.dropna(subset=["moviename"])
        self._test_frame.dropna(subset=["moviename"])
        return

    def _addtodict(self, name, this_dict):
        if name in this_dict:
            this_dict[name] += 1
        else:
            this_dict[name] = 1
        return

    def _modify_string(self, playername):
        # remove leading/trailing spaces, inner whitespace and asterisks
        playername = re.sub('^\s+|\s+$', '', playername)
        playername = re.sub('\s+', '_', playername)
        playername = re.sub('\*', '', playername)
        return playername

    # this function creates a list of features corresponding to the most
    # frequent actors in a movie
    def _create_playerdict(self, frame, colname, num_features):
        playerdict = {}
        for index in frame.index:
            # each row has a list of actors, e.g. ['Sandra Bullock', 'Melissa McCarthy']
            playerlist = frame.loc[index, colname]
            if type(playerlist) != float:
                # only actors have multiple list members; other players
                # like director don't
                if colname == "actors":
                    for playername in playerlist:
                        playername = self._modify_string(playername)
                        self._addtodict(playername, playerdict)
                else:
                    playerlist = self._modify_string(playerlist)
                    self._addtodict(playerlist, playerdict)
        counter = 0
        feature_list = []
        # sort the dict to get the players with the highest number of movies
        for key, value in sorted(playerdict.items(), key=lambda x: x[1], reverse=True):
            feature_list.append(key)
            counter += 1
            if counter > num_features:
                break
        return feature_list

    # this function returns the values of the player features for each movie
    def _create_player_features(self, frame, colname, num_features):
        # feature_list holds the names of the players with the most movies
        feature_list = self._create_playerdict(frame, colname, num_features)
        actor_frame = pd.DataFrame()
        for player in feature_list:
            feature_name = colname + ":" + player
            actor_frame[feature_name] = pd.Series(0, index=frame.index)
        # big actors/directors present or not?
        bigplayer_name = "feature:big_" + colname
        actor_frame[bigplayer_name] = pd.Series(0, index=frame.index)
        for index in frame.index:
            playerval = frame.loc[index, colname]
            if type(playerval) != float:  # playerval is not NaN
                if colname == "actors":
                    for actor in playerval:
                        actor = self._modify_string(actor)
                        if actor in feature_list:
                            thisfeature = colname + ":" + actor
                            actor_frame.loc[index, thisfeature] = 1
                else:
                    playerval = self._modify_string(playerval)
                    if playerval in feature_list:
                        thisfeature = colname + ":" + playerval
                        actor_frame.loc[index, thisfeature] = 1
                        actor_frame.loc[index, bigplayer_name] = 1
                    else:
                        actor_frame.loc[index, bigplayer_name] = 0
        return actor_frame

    def _create_theater_features(self, frame):
        # add feature column
        theater_frame = pd.DataFrame()
        theater_frame["feature:num_theaters"] = pd.Series(0, index=frame.index)
        for index in frame.index:
            theater_list = frame.loc[index, "theater_list"]
            if type(theater_list) == list and len(theater_list) > 0:
                theater = theater_list[0]
                theater = re.sub(',', '', theater)
                if re.search('\d+', theater) is not None:
                    theater_frame.loc[index, "feature:num_theaters"] = int(theater)
                else:
                    theater_frame.loc[index, "feature:num_theaters"] = 0
            else:
                theater_frame.loc[index, "feature:num_theaters"] = 0
        return theater_frame

    def _first_weekend_rank(self, frame):
        # todo: try to merge with _create_theater_features
        weekend_frame = pd.DataFrame()
        weekend_frame["feature:rank"] = pd.Series(0, index=frame.index)
        for index in frame.index:
            rank_list = frame.loc[index, "rank_list"]
            if type(rank_list) == list and len(rank_list) > 0:
                rank = rank_list[0]
                rank = re.sub(',', '', rank)
                if re.search('\d+', rank) is not None:
                    weekend_frame.loc[index, "feature:rank"] = int(rank)
                else:
                    # some large number? or zero?
                    weekend_frame.loc[index, "feature:rank"] = 1000
            else:
                weekend_frame.loc[index, "feature:rank"] = 1000
        return weekend_frame

    def _create_running_time_feature(self, frame):
        runtime_frame = pd.DataFrame()
        runtime_frame["feature:runtime"] = pd.Series(0, index=frame.index)
        for index in frame.index:
            running_time = frame.loc[index, "runtime"]
            if type(running_time) != float:  # not NaN
                pattern = '(\d+).+\s(\d+)'
                hrmin = re.match(pattern, running_time)
                if hrmin is not None:
                    hrs = hrmin.group(1)
                    mins = hrmin.group(2)
                    tot_time = int(hrs) * 60 + int(mins)
                    runtime_frame.loc[index, "feature:runtime"] = tot_time
                else:
                    runtime_frame.loc[index, "feature:runtime"] = 0
            else:
                runtime_frame.loc[index, "feature:runtime"] = 0
        return runtime_frame

    def _create_release_date_feature(self, frame):
        # the original was missing the comma after "June", fusing "JuneJuly"
        monthlist = ["January", "February", "March", "April", "May", "June",
                     "July", "August", "September", "October", "November", "December"]
        month_frame = pd.DataFrame()
        for month in monthlist:
            feature_name = "feature:release_" + month
            month_frame[feature_name] = pd.Series(0, index=frame.index)
        for index in frame.index:
            release_date = frame.loc[index, "release_date"]
            if type(release_date) != float:
                pattern = '(\S+)\s(\d+)'
                monthday = re.match(pattern, release_date)
                if monthday is not None:
                    month = monthday.group(1)
                    day = monthday.group(2)
                    if month in monthlist:
                        thisfeature = "feature:release_" + month
                        month_frame.loc[index, thisfeature] = 1
        return month_frame

    def _extract_features(self, frame, isTraining=True):
        """
        Extracts features from the training and test frames; all major data
        munging and cleaning takes place here. Each feature group is built as
        a dataframe and the groups are concatenated at the end.
        """
        # check if labels exist for these movies
        clean_data = frame[pd.notnull(frame["domestic_gross"])]
        list_of_frames = []
        # number of theaters it opened at in the first week
        # (keep this as the first feature so that you can plot using it)
        list_of_frames.append(self._create_theater_features(clean_data))
        print("Created Theater Feature...")
        list_of_frames.append(self._first_weekend_rank(clean_data))
        print("Created Rank Feature...")
        list_of_frames.append(self._create_running_time_feature(clean_data))
        print("Created running time Feature...")
        list_of_frames.append(self._create_release_date_feature(clean_data))
        print("Created release date Feature...")
        # create player features
        list_of_frames.append(self._create_player_features(clean_data, "actors", 5))
        list_of_frames.append(self._create_player_features(clean_data, "director", 5))
        list_of_frames.append(self._create_player_features(clean_data, "distributor", 5))
        list_of_frames.append(self._create_player_features(clean_data, "genre_toplist", 5))
        list_of_frames.append(self._create_player_features(clean_data, "mpaa_rating", 5))
        print("Created player Features...")
        # check dataframe shapes
        for frames in list_of_frames:
            assert frames.shape[0] == clean_data.shape[0]
        # concatenate the dataframes
        final_frame = pd.concat(list_of_frames, axis=1)
        final_frame.to_csv("Training/training_frame.csv")
        # get training labels
        if isTraining:
            labels_arr = self._extract_labels(clean_data)
        else:
            prediction_frame = clean_data[["moviename", "genre_toplist", "actors"]]
        n_samples = len(final_frame.index)
        n_features = len(final_frame.columns)
        # from DataFrame to numpy array
        feature_arr = final_frame.values.reshape(n_samples, n_features)
        print("Created All Features.....")
        if isTraining:
            return feature_arr, labels_arr
        else:
            return feature_arr, prediction_frame
        #plt.plot(theater_arr, self._clf.predict(theater_arr), 'r-', linewidth=2)
        #plt.show()

    def _extract_labels(self, frame):
        df_Y = frame["domestic_gross"].values
        gross_list = df_Y.tolist()
        for i in range(len(gross_list)):
            gross_list[i] = int(gross_list[i])
        max_gross = np.max(gross_list)
        # normalize gross to [0, 1]
        gross_list = [x / max_gross for x in gross_list]
        n_samples = len(gross_list)
        gross_arr = np.array(gross_list).reshape(n_samples, 1)
        return gross_arr

    def _get_top_actors(self, actorlist):
        top_actors = [None, None, None]
        if type(actorlist) == float:
            return top_actors
        counter = 0
        for actor in actorlist:
            top_actors[counter] = self._modify_string(actor)
            counter += 1
            if counter == 2:
                break
        return top_actors

    def explore_data(self):
        """
        Plots and prints various kinds of stuff to test out the data;
        change, comment and uncomment here directly.
        """
        if self._training_frame is None:
            self._load_dataframe()
        #print(self._training_frame.loc[500:510])
        #print(len(self._training_frame.index))
        #only_budget = self._training_frame[pd.isnull(self._training_frame["domestic_gross"])]
        #print(len(only_budget.index))
        #actors_there = self._training_frame[pd.notnull(self._training_frame["actors"])]
        #print(len(actors_there.index), actors_there.head())
        #director_there = self._training_frame[pd.notnull(self._training_frame["director"])]
        #print(len(director_there.index), director_there.head())
        pass

    def top_5_genres(self):
        if self._training_frame is None:
            self._load_dataframe()
        genre_list = self._create_playerdict(self._training_frame, "genre_toplist", 5)
        print(genre_list)

    def train_2013(self):
        self._load_dataframe()
        self._training_frame.to_csv("Training/raw_frame.csv")
        total_features, total_labels = self._extract_features(self._training_frame,
                                                              isTraining=True)
        total_labels = np.ravel(total_labels)
        print(type(total_features))
        print(type(total_labels))
        # create train and test split
        self._features, test_features, self._labels, test_labels = \
            train_test_split(total_features, total_labels, test_size=0.33)
        print(self._features.shape)
        print(self._labels.shape)
        print(test_features.shape)
        print(test_labels.shape)
        cv_outer = KFold(n_splits=5)
        self._clf = LassoCV(eps=0.01, n_alphas=10, cv=5)
        cross_val_arr = cross_val_score(self._clf, self._features, self._labels,
                                        cv=cv_outer)
        print("Finished Training.....")
        r_sq = np.mean(cross_val_arr)
        print("R Square for training set: ", r_sq)
        self._clf.fit(self._features, self._labels)
        plt.plot(test_labels, self._clf.predict(test_features), 'ro', linewidth=2)
        plt.plot(np.arange(0, 1., .1), np.arange(0, 1., .1), 'b-', linewidth=2)
        plt.xlabel("Actual Gross")
        plt.ylabel("Predicted Gross")
        plt.show()

    def test_2014(self):
        # check if already trained
        if self._clf is None:
            self.train_2013()
        print("Generating Test Features...")
        self._test_features, self._prediction_frame = self._extract_features(
            self._test_frame, isTraining=False)
        self._prediction_frame["prediction"] = self._clf.predict(self._test_features)
        print("Finished Testing...")
        # sanity check and normalize
        self._prediction_frame["prediction"] = self._prediction_frame["prediction"].apply(
            lambda x: 0 if x < 0 else x)
        maxpred = self._prediction_frame["prediction"].max()
        if maxpred > 1:
            self._prediction_frame["prediction"] = self._prediction_frame["prediction"].apply(
                lambda x: x / maxpred)
        print(self._prediction_frame.head())

    def save_db(self, filename):
        con = sqlite3.connect(filename)
        cursor = con.cursor()
        cursor.execute('DROP TABLE IF EXISTS currentmovies')
        cursor.execute('CREATE TABLE currentmovies('
                       'moviename VARCHAR(255),'
                       'genre VARCHAR(255),'
                       'prediction INT,'
                       'actor1 VARCHAR(255),'
                       'actor2 VARCHAR(255),'
                       'actor3 VARCHAR(255))')
        for index in self._prediction_frame.index:
            movname = self._prediction_frame.loc[index]["moviename"].encode('utf-8')
            pred = self._prediction_frame.loc[index]["prediction"]
            genre = self._prediction_frame.loc[index]["genre_toplist"].encode('utf-8')
            (actor1, actor2, actor3) = self._get_top_actors(
                self._prediction_frame.loc[index]["actors"])
            if type(movname) == float and math.isnan(movname):
                continue
            print(movname, genre, pred)
            cursor.execute('INSERT INTO currentmovies VALUES(?,?,?,?,?,?)',
                           (movname, genre, pred, actor1, actor2, actor3))
        con.commit()
        con.close()
def select(self, X, y, weight):
    lm = LassoCV(cv=self.cv, normalize=False, max_iter=2000)
    lm.fit(X, y)
    # indices of features with non-zero lasso coefficients
    f_indices = np.argwhere(lm.coef_ != 0).T[0]
    return f_indices
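# Minimal illustration of the selection step above, outside the host class
# (synthetic data; the integer `cv=5` plays the role of self.cv, and the
# setup is an assumption for demonstration only).
import numpy as np
from sklearn.linear_model import LassoCV

rng = np.random.RandomState(0)
X = rng.randn(120, 8)
y = 3 * X[:, 2] - 2 * X[:, 5] + 0.05 * rng.randn(120)
lm = LassoCV(cv=5, max_iter=2000).fit(X, y)
f_indices = np.argwhere(lm.coef_ != 0).T[0]
print(f_indices)   # expected to include columns 2 and 5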
lassoreg.fit(X_train, y_train)
print(lassoreg.coef_)

# calculate RMSE (for alpha=0.01)
y_pred = lassoreg.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# - [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html): lasso regression with built-in cross-validation of the alpha parameter
# - **n_alphas:** number of alpha values (automatically chosen) to try

# select the best alpha with LassoCV
from sklearn.linear_model import LassoCV
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
lassoregcv.alpha_

# examine the coefficients
print(lassoregcv.coef_)

# predict method uses the best alpha value
y_pred = lassoregcv.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# ## Part 5: Regularized classification in scikit-learn
#
# - Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine)
# In[11]:

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.15)

# In[12]:

cv = RepeatedKFold(n_splits=5, n_repeats=3)
lasso = LassoCV(alphas=None, cv=cv, max_iter=100000)
lasso.fit(xtrain, ytrain)

# In[13]:

ypred = lasso.predict(xtest)
r2 = r2_score(ytest, ypred)
print('R2 Score (test): ', r2)

score = lasso.score(xtrain, ytrain)
print("R-squared (train):", score)

# In[14]:
def train_and_analyse(_X, _y, features):
    X = _X
    Y = _y
    cv_l = cross_validation.KFold(X.shape[0], n_folds=10, shuffle=True,
                                  random_state=1)
    ranks = {}

    lr = LinearRegression(normalize=True)
    lr.fit(X, Y)
    ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)

    ridge = RidgeCV(cv=cv_l)
    ridge.fit(X, Y)
    ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)

    lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001,
                    max_iter=170000)
    lasso.fit(X, Y)
    ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)

    # Run the RandomizedLasso: we use a path going down to 0.1*alpha_max
    # to avoid exploring the regime in which very noisy variables enter
    # the model (requires scikit-learn < 0.21)
    rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
    rlasso.fit(X, Y)
    ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)

    rfe = RFE(lr, n_features_to_select=1)
    rfe.fit(X, Y)
    ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float),
                                features, order=-1)

    rf = RandomForestRegressor(n_estimators=500)
    rf.fit(X, Y)
    ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

    f, pval = f_regression(X, Y, center=True)
    ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

    mine = MINE()
    mic_scores = []
    for i in range(X.shape[1]):
        mine.compute_score(X[:, i], Y)
        m = mine.mic()
        mic_scores.append(m)
    ranks["MIC"] = rank_to_dict(mic_scores, features)

    r = {}
    for name in features:
        r[name] = round(np.mean([ranks[method][name]
                                 for method in ranks.keys()]), 2)
    methods = sorted(ranks.keys())
    ranks["Mean"] = r
    methods.append("Mean")

    ranks = pd.DataFrame(ranks)
    selection_feature = ranks[ranks.Mean > 0.12].index.values
    return ranks, selection_feature
def fit(self, X, y):
    LassoCV.fit(self, X, y)
    # estimate the noise variance from the in-sample residuals
    pred = self.predict(X)
    self.sigma2_ = np.mean((y - pred) ** 2)
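# A minimal sketch of the kind of subclass the `fit` override above belongs
# to. The class name `LassoCVWithNoise` and the `return self` are assumptions
# for illustration; only the sigma2_ attribute comes from the original snippet.
import numpy as np
from sklearn.linear_model import LassoCV

class LassoCVWithNoise(LassoCV):
    def fit(self, X, y):
        LassoCV.fit(self, X, y)
        pred = self.predict(X)
        self.sigma2_ = np.mean((y - pred) ** 2)  # residual variance estimate
        return self  # added so calls can be chained

rng = np.random.RandomState(0)
X = rng.randn(100, 5)
y = X[:, 0] - 2 * X[:, 1] + 0.1 * rng.randn(100)
model = LassoCVWithNoise(cv=5).fit(X, y)
print(model.alpha_, model.sigma2_)  # chosen alpha and noise-variance estimate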
def SolveJointRegression(X1, X2, Y1, Y2, fused=True, s1=15, s=2):
    # NOTE: n1, n2, n_vars, n_features, regressor_ind, alp, n_jobs and the
    # helpers FusedMultiTaskLassoCV, DecorrInverseCovDiff, DecorrInverseCovQP
    # are assumed to be defined at module level.
    mean_Y1_s = Y1.mean()
    mean_Y2_s = Y2.mean()
    # unbiased estimate of the noise variance of the feature being regressed on
    # TODO: make sigma a subfunction with options
    sigma1 = (Y1 - mean_Y1_s).dot(Y1 - mean_Y1_s) / n1
    # empirical covariance matrix for the X_s^c
    Sigma1 = np.dot(X1[:, regressor_ind].T, X1[:, regressor_ind]) / n1
    sigma2 = (Y2 - mean_Y2_s).dot(Y2 - mean_Y2_s) / n2
    Sigma2 = np.dot(X2[:, regressor_ind].T, X2[:, regressor_ind]) / n2
    Ys = [Y1, Y2]
    Xs = [X1[:, regressor_ind], X2[:, regressor_ind]]

    # regularization of order sigma * sqrt(log(p) / n)
    lam_L1 = np.sqrt(sigma1) * np.sqrt(2. * (np.log(n_vars - 1)) / n1) * 2
    alphas = np.array([0.1, 0.5, 1., 5., 10., 50., 100.]) * lam_L1

    if fused:
        lam_L2 = np.sqrt(sigma2) * np.sqrt(2. * (0.01 + np.log(n_vars - 1)) / n2)
        lam_F = np.sqrt(sigma2) * np.sqrt((np.log((n_vars - 1)) / (n2)))
        fm_cv = FusedMultiTaskLassoCV(cv=5,
                                      gammas=np.array([0.1, 1., 10., 100.]) * lam_F,
                                      minLam=0.1 * lam_F,
                                      n_jobs=n_jobs)
        fm_cv.fit(Xs, Ys)
        coefs = np.reshape(fm_cv.coefs, (2, n_vars - 1)).T
        mu1 = 4. * (fm_cv.alpha_ * s1 + fm_cv.gamma * s)
        mu1 = (1. / mu1) / (n2 ** (0.01))
        mu = 2. * (fm_cv.alpha_) * s
        mu = (1. / mu) / (n2 ** (0.01))
        Theta1, Theta2 = DecorrInverseCovDiff(X1[:, regressor_ind],
                                              X2[:, regressor_ind],
                                              mu=mu, mu1=mu1)
    else:
        # two debiased lassos
        coefs = np.zeros((n_vars - 1, 2))
        ls_cv = LassoCV(cv=5, fit_intercept=False, alphas=alphas)
        ls_cv.fit(X1[:, regressor_ind], Y1)
        coefs[:, 0] = ls_cv.coef_
        lam_L2 = np.sqrt(sigma2) * np.sqrt(2. * (0.01 + np.log(n_vars - 1)) / n2) * 2
        alphas = np.array([0.1, 0.5, 1., 5., 10., 50., 100.]) * lam_L2
        ls_cv2 = LassoCV(cv=5, fit_intercept=False, alphas=alphas)
        ls_cv2.fit(X2[:, regressor_ind], Y2)
        coefs[:, 1] = ls_cv2.coef_
        mu_L = (1. / np.sqrt(n1)) * scipy.stats.norm.ppf(1 - (0.1 / ((n_features ** 2))))
        mu_L2 = (1. / np.sqrt(n2)) * scipy.stats.norm.ppf(1 - (0.1 / ((n_features ** 2))))
        Theta1 = DecorrInverseCovQP(X1[:, regressor_ind], mu=mu_L)
        Theta2 = DecorrInverseCovQP(X2[:, regressor_ind], mu=mu_L2)

    # debias the lasso estimates (one-step correction)
    coefs_debiased1 = coefs[:, 0] + Theta1.dot(X1[:, regressor_ind].T.dot(
        Y1 - X1[:, regressor_ind].dot(coefs[:, 0]))) / n1
    coefs_debiased2 = coefs[:, 1] + Theta2.dot(X2[:, regressor_ind].T.dot(
        Y2 - X2[:, regressor_ind].dot(coefs[:, 1]))) / n2
    coefs_debiased = coefs_debiased1 - coefs_debiased2

    # asymptotic variance, confidence interval and p-values for the difference
    var_components1 = sigma1 * np.diag(Theta1.dot(Sigma1).dot(Theta1.T) / n1)
    var_components2 = sigma2 * np.diag(Theta2.dot(Sigma2).dot(Theta2.T) / n2)
    std_components = np.sqrt(var_components1 + var_components2)
    interval = scipy.stats.norm.ppf(1. - alp / 2.) * std_components
    LowerConfInterval = coefs_debiased - interval
    UpperConfInterval = coefs_debiased + interval
    Pvals = 2 * (1. - scipy.stats.norm.cdf(np.abs(coefs_debiased / (std_components))))
    return coefs_debiased, Pvals, LowerConfInterval, UpperConfInterval
def alasso(X_Train, X_Test, Y_Train, Y_Test, target):
    X_Tr = copy.deepcopy(X_Train)
    X_Te = copy.deepcopy(X_Test)
    Y_Tr = copy.deepcopy(Y_Train)
    Y_Te = copy.deepcopy(Y_Test)
    [Tr_sample, num_feature] = X_Tr.shape

    # (lat, lon) of each prediction target
    target_dict = {
        'Brazil': np.array([-10.0, 310.0]),
        'Peru': np.array([-5.75, 283.0]),
        'Asia': np.array([-10.0, 137.0])
    }
    target_location = target_dict[target]

    # grid resolution and lat/lon lookup file, keyed by feature count
    resolution = {
        332: [10, 'lat_lon_10x10.mat'],
        1257: [5, 'lat_lon_5x5.mat'],
        5881: [2.5, 'lat_lon.mat']
    }
    step = resolution[num_feature][0]
    data = loadmat(resolution[num_feature][1])
    position = data['lat_lon_data']
    [row, column] = position.shape

    lat_block = np.zeros(row)
    lon_block = np.zeros(column)
    for i in range(row):
        lat_block[i] = -90 + step * i
    for i in range(column):
        lon_block[i] = 0 + i * step

    # normalized distance from every grid cell to the target location
    distance = np.zeros((row, column))
    for i in range(row):
        for j in range(column):
            distance[i][j] = cal_dis(lat_block[i], lon_block[j],
                                     target_location[0], target_location[1])
    max_value = np.amax(distance)
    for i in range(row):
        for j in range(column):
            distance[i][j] = distance[i][j] / max_value

    # per-feature weights: features far from the target are shrunk harder
    weight = np.zeros(num_feature)
    count = -1
    for i in range(row):
        for j in range(column):
            if position[i][j] == 0:
                count = count + 1
                weight[count] = distance[i][j]

    X_w = np.zeros((Tr_sample, num_feature))
    for i in range(num_feature):
        for j in range(Tr_sample):
            X_w[j][i] = X_Tr[j][i] / weight[i]

    lasso_cv = LassoCV(alphas=np.linspace(0.01, 0.5, 25), fit_intercept=True,
                       normalize=False, precompute='auto', max_iter=10000,
                       tol=0.00001, copy_X=True, cv=10, verbose=False,
                       n_jobs=1, positive=False, random_state=None,
                       selection='cyclic')
    lasso_cv.fit(X_w, Y_Tr)
    l = lasso_cv.alpha_
    beta = lasso_cv.coef_

    # rescale the coefficients back to the original feature space
    beta_update = np.zeros(beta.shape)
    for i in range(num_feature):
        beta_update[i] = beta[i] / weight[i]

    Tr_pred = X_Tr.dot(beta_update)
    Te_pred = X_Te.dot(beta_update)
    r2_train = r2_score(Y_Tr, Tr_pred)
    r2_test = r2_score(Y_Te, Te_pred)
    rmse_train = sqrt(mean_squared_error(Y_Tr, Tr_pred))
    rmse_test = sqrt(mean_squared_error(Y_Te, Te_pred))
    return (Tr_pred, Te_pred)
features = features.dropna(axis=1)
# Alpha grid 0.01, 0.02, ..., 100.0. Note the float division: under Python 2,
# a / 100 silently truncates to an integer, collapsing most of the grid.
alpha_values = [a / 100.0 for a in range(1, 10001)]
print "Started at " + str(datetime.now())
estimator_ridge = RidgeCV(alphas=alpha_values, cv=3)
estimator_ridge.fit(features, goal)
scores = cross_val_score(Ridge(alpha=estimator_ridge.alpha_), features, goal, cv=5)
print "Ridge alpha " + str(estimator_ridge.alpha_)
print str(np.mean(scores))
print scores
estimator_lasso = LassoCV(alphas=alpha_values, cv=3)
estimator_lasso.fit(features, goal)
scores = cross_val_score(Lasso(alpha=estimator_lasso.alpha_), features, goal, cv=5)
print "Lasso alpha " + str(estimator_lasso.alpha_)
print str(np.mean(scores))
print scores
estimator_elastic_net = ElasticNetCV(alphas=alpha_values, cv=3, n_jobs=-1)
estimator_elastic_net.fit(features, goal)
scores = cross_val_score(ElasticNet(alpha=estimator_elastic_net.alpha_), features, goal, cv=5)
print "ElasticNet alpha " + str(estimator_elastic_net.alpha_)
print str(np.mean(scores))
print scores
print "Finished at " + str(datetime.now())
def train_and_score(X, y, occ_codes, lookup):
    lasso = LassoCV(max_iter=10000, cv=LeaveOneOut(), n_jobs=-1)
    lasso.fit(X, y)
    return score_model(lasso.alpha_, X, y, occ_codes, lookup)
def best_lasso(df, resp_var, exp_vars, kcv=3, cv_path=False, hists=False):
    """ Find the best lasso model through cross-validation.

    Args:
        df:        Dataframe
        resp_var:  String. Response variable
        exp_vars:  List of strings. Explanatory variables
        kcv:       Number of cross-validation folds
        cv_path:   Whether to plot the path of cross-validation scores
        hists:     Whether to plot histograms of coefficient estimates
                   based on bootstrapping

    Returns:
        Dataframe of coefficients for the best model. Optionally also plots
        the CV path and histograms of coefficient variability based on
        bootstrap resampling.
    """
    import seaborn as sn
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LassoCV
    from sklearn.utils import resample

    # Standardise the feature data and response
    feat_std = StandardScaler().fit_transform(df[[resp_var,] + exp_vars])

    model = LassoCV(fit_intercept=False, normalize=False, max_iter=10000,
                    cv=kcv, eps=1e-3)

    # Train model on full dataset
    model.fit(feat_std[:, 1:], feat_std[:, 0])
    print model

    # Get param estimates
    params = pd.DataFrame(pd.Series(model.coef_, index=exp_vars))

    if cv_path:
        # Display results
        m_log_alphas = -np.log10(model.alphas_)
        plt.figure()
        plt.plot(m_log_alphas, model.mse_path_, ':')
        plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                    label='alpha: CV estimate')
        plt.legend()
        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean square error')
        plt.axis('tight')
        plt.show()

    if hists:
        # Estimate confidence using the bootstrap, i.e. the std. dev. of the
        # estimate for each parameter over 1000 resamplings
        err = np.array([model.fit(*resample(feat_std[:, 1:],
                                            feat_std[:, 0])).coef_
                        for i in range(1000)])
        err_df = pd.DataFrame(data=err, columns=exp_vars)

        # Melt for plotting with seaborn
        err_df = pd.melt(err_df)
        g = sn.FacetGrid(err_df, col="variable", col_wrap=4)
        g = g.map(plt.hist, "value", bins=20)

        # Vertical line at 0 (plt.axvline: seaborn no longer re-exports
        # pyplot as sn.plt)
        g.map(plt.axvline, x=0, c='k', lw=2)

    return params
# Compare regressors via 10-fold cross-validated predictions. `lassoCV`,
# `linear`, `colors`, `scores` and `model_place` are assumed to be defined
# earlier in the script.
mpl = MLPRegressor()
poly = make_pipeline(PolynomialFeatures(2), Ridge())
knn = neighbors.KNeighborsRegressor(5, weights='distance')
lasso = linear_model.Lasso(alpha=0.1, max_iter=100000)
methods = [mpl, lassoCV, linear, poly, knn]
# Six labels: five cross-validated methods, plus the final lasso fit on the
# big dataset ('Q-Lasso') plotted in the last panel.
m_names = [u'Neuralt Nätverk', 'Lasso', u'Linjär', 'Polynom', 'KNN', 'Q-Lasso']
type = 'ny'
X = joblib.load(model_place + type + 'X.pkl')
y = joblib.load(model_place + type + 'y.pkl')
y_mean = np.mean(y)
type = 'big'
Xbig = joblib.load(model_place + type + 'X.pkl')
ybig = joblib.load(model_place + type + 'y.pkl')
print lassoCV.fit(X, y).coef_
print Xbig.shape
print y_mean
f, ax = plt.subplots(3, 2)
for k in range(0, len(methods), 1):
    pred = cross_val_predict(methods[k], X, y, cv=10, verbose=True)
    temp_ax = ax[k / 2, k % 2]
    temp_ax.scatter(y, pred, marker='x', label=m_names[k], color=colors[1],
                    alpha=0.15, s=1)
    temp_ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=1)
    scores(y, pred)
    temp_ax.legend(loc='upper left', fontsize=10)
    temp_ax.tick_params(labelsize=8)
pred = cross_val_predict(lasso, Xbig, ybig, cv=10, verbose=True)
k = 5
temp_ax = ax[2, 1]
# now, let's fit it to the total model, and compute predictions
model2_real = Lasso(alpha=0.01, max_iter=50000)
model2_real.fit(scaled_X, y_log)
reg = LassoCV(cv=5, random_state=0).fit(scaled_X, y_log)
reg.score(scaled_X, y_log)
Lasso_predictions = reg.predict(scaled_X_test)

# let's try a new model: Random Forests
from sklearn.ensemble import RandomForestRegressor
reg_RF = RandomForestRegressor()
reg_RF.fit(X_train, y_train)
pred_cv = reg_RF.predict(X_cv)
error_RF = np.sqrt(mean_squared_log_error(y_cv, pred_cv))
error_RF
# that looks about the same; refit the random forest on the full data
# (the original refit `reg`, the LassoCV model, by mistake)
reg_RF.fit(scaled_X, y_log)
RF_predictions = reg_RF.predict(scaled_X_test)

# let's compare Random Forests with Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
reg_GB = GradientBoostingRegressor()
reg_GB.fit(X_train, y_train)
pred_cv = reg_GB.predict(X_cv)
error_GB = np.sqrt(mean_squared_log_error(y_cv, pred_cv))
error_GB

# Let's use GridSearchCV to find the best hyperparameters of our gradient
# boosting algo. Note: I've already completed this step, the results are below.
#from sklearn.model_selection import GridSearchCV
#param_grid = {
#    'alpha' : [0.1,0.5,0.9],
#    'n_estimators': [50,100,800,1500],
#    'max_features': [20,15],
#    'max_depth': [3,5,8,10],
data = data.values
x = data[:, 2:].astype(np.float)
y = data[:, 1].astype(np.int)
columns = columns[2:]
ss = StandardScaler()
x = ss.fit_transform(x)
# append a column of ones (manual intercept, since fit_intercept=False below)
t = np.ones(x.shape[0]).reshape((-1, 1))
print t.shape, x.shape
x = np.hstack((t, x))
# model = ElasticNetCV(alphas=np.logspace(-3, 2, 50), l1_ratio=[.1, .5, .7, .9, .95, .99, 1], fit_intercept=False)
model = LassoCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False)
model.fit(x, y)
y_hat = model.predict(x)
y_hat[y_hat < 0] = 0
print 'model.alpha = \t', model.alpha_
# print 'model.l1_ratio = \t', model.l1_ratio_
print 'model.coef_ = \n', model.coef_
print 'model.predict(x) = \n', y_hat
print 'Actual = \n', y
print 'RMSE:\t', np.sqrt(np.mean((y_hat - y) ** 2))
print 'R2:\t', model.score(x, y)
for theta, col in zip(model.coef_[1:], columns):
    if theta > 0.01:
        print col, theta
plt.figure(facecolor='w')
t = np.arange(len(y))
def three_way_fit(clusters, energies, counts, comps, fold_pick=10,
                  Normalize=True, Intercept=True, Energy_above_hull=True,
                  name=''):
    ###- Lambda expression for scaling to energy above hull -###
    scale = lambda x0, y0, x1, y1, x2, y2: abs(
        (y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / np.sqrt(
        (y2 - y1) ** 2 + (x2 - x1) ** 2)

    ###- scale to energy above hull -###
    if Energy_above_hull:  # if you want to FIT to energy above hull (which you don't)
        y1 = min(energies)
        y2 = max(energies)
        x2 = min(comps)
        x1 = max(comps)
        if x2 == x1:
            energies = [energies[i] - y1 for i in range(len(energies))]
        else:
            energies = [scale(comps[i], energies[i], x1, y1, x2, y2)
                        for i in range(len(energies))]

    ###- Set up text for output file -###
    file_out = 'Fit_summary.txt'
    file = open(file_out, 'w')
    file.write('Data set: ' + name + '\n\n' +
               'Clusters: [[species],[distance],[chem (0) / spin (1)]]' + '\n')
    [file.write(str(clusters[i]) + '\n') for i in range(len(clusters))]
    file.write('\n\nEnergy per Atom (eV) : Cluster Count per Atom\n')
    for i in range(len(energies)):
        file.write(str(energies[i]) + ' : ' + str(counts[i]) + '\n')

    ###- Set up alphas for CV -###
    alpha_range = [-10, 10]
    alpha_lasso = np.logspace(alpha_range[0], alpha_range[1], num=1000)  # for lasso CV
    n_alphas = 1010
    alpha_ridge = np.logspace(-15, 10, n_alphas)  # for ridge CV

    ###- Set range for plot -###
    axis_range = [min(energies) * 1.0001, max(energies) * .9999]

    # LASSO and RIDGE with cross-validation, linear regression without CV
    lassocv = LassoCV(alphas=alpha_lasso, normalize=Normalize,
                      fit_intercept=Intercept, cv=fold_pick, max_iter=1e5)
    ridgecv = RidgeCV(alphas=alpha_ridge, normalize=Normalize,
                      fit_intercept=Intercept, cv=None, store_cv_values=True)
    linreg = LinearRegression(fit_intercept=Intercept, normalize=Normalize)

    # Fit the data with each method
    lassocv.fit(counts, energies)
    ridgecv.fit(counts, energies)
    linreg.fit(counts, energies)
    lassocv_rmse = np.sqrt(lassocv.mse_path_)
    ridgecv_rmse = np.sqrt(ridgecv.cv_values_)

    # Random forest regression; max_depth is hard-coded to 5 but can be played with
    RandF_reg = RandomForestRegressor(max_depth=5, random_state=0)
    RandF_reg.fit(counts, energies)

    ### Get results ready for energy above hull plots ###
    y1 = min(energies)
    y2 = max(energies)
    x2 = min(comps)
    x1 = max(comps)
    eahDFT = [scale(comps[i], energies[i], x1, y1, x2, y2)
              for i in range(len(energies))]
    axis_rangeEAH = [-0.002, max(eahDFT) * 1.1]

    ################ RANDOM FOREST FIT #################################
    file.write("\n\n#### Random Forest #### ")
    # Plot Fit vs DFT
    cluster_energy_RF = RandF_reg.predict(counts)
    # print(RandF_reg.estimators_)  # comment in to see the estimators generated by the random forest (a bit messy)
    # print(RandF_reg.get_params())
    plt.figure()
    plt.scatter(energies, cluster_energy_RF, color='b', alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('Random Forest Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_RF[i], x1, y1, x2, y2)
             for i in range(len(cluster_energy_RF))]
    plt.scatter(eahDFT, eahCE, color='b', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('Random Forest Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()

    ################ LASSO FIT #########################################
    file.write("\n\n#### LASSO #### \nk-folds cross validation\n")
    file.write("alpha: %7.6f\n" % lassocv.alpha_)
    file.write("avg rmse: %7.4f\n" % min(lassocv_rmse.mean(axis=-1)))
    file.write("score: %7.4f\n" % lassocv.score(counts, energies))
    file.write("non-zero coefficients: %7.4f\n" % np.count_nonzero(lassocv.coef_))
    file.write('Intercept: ')
    file.write(str(lassocv.intercept_))
    file.write('\n')
    # Show data from cross validation / cluster picking
    plt.figure()
    m_log_alphas = -np.log10(lassocv.alphas_)
    plt.plot(m_log_alphas, lassocv_rmse, ':')
    plt.plot(m_log_alphas, lassocv_rmse.mean(axis=-1), 'k',
             label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(lassocv.alpha_), linestyle='--', color='k',
                label='alpha: CV estimate')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Root-mean-square error')
    plt.title('Root-mean-square error on each fold: ' + name)
    plt.legend()
    plt.tight_layout()
    plt.show()
    # Plot Fit vs DFT
    cluster_energy_ce = lassocv.predict(counts)
    plt.figure()
    plt.scatter(energies, cluster_energy_ce, color='b', alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('LASSO Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_ce[i], x1, y1, x2, y2)
             for i in range(len(cluster_energy_ce))]
    plt.scatter(eahDFT, eahCE, color='b', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('LASSO Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    # Show non-zero clusters
    cluster_energy_new = []
    for i in range(len(energies)):
        cluster_energy_new.append(energies[i] - cluster_energy_ce[i])
    cluster_coef = []
    cluster_pick = []
    cluster_coef.append(lassocv.intercept_)
    cluster_coef_all = lassocv.coef_
    cluster_nonzero = [c for c, v in enumerate(cluster_coef_all)
                       if abs(v) >= 1e-11]
    for i in cluster_nonzero:
        cluster_coef.append(cluster_coef_all[i])
        cluster_pick.append(clusters[i])
    file.write("\n Clusters \n")
    for i in range(len(cluster_pick)):
        if len(cluster_pick[i]) == 2:
            file.write(str(cluster_pick[i][0]) + ':' + '[0]' + ':' +
                       str(cluster_pick[i][1][0]) + ':' +
                       str(cluster_coef[i + 1]) + '\n')
        else:
            file.write(str(cluster_pick[i][0]) + ':' + str(cluster_pick[i][1]) + ':' +
                       str(cluster_pick[i][2][0]) + ':' +
                       str(cluster_coef[i + 1]) + '\n')
    file.write("\n")
    file.write("\n")

    ############# RIDGE FIT ############################################
    file.write("### RIDGE ### \nk-folds cross validation\n")
    file.write("alpha: %7.6f\n" % ridgecv.alpha_)
    # cv_values_ has shape (n_samples, n_alphas): average over samples first
    file.write("avg rmse: %7.4f\n" % min(ridgecv_rmse.mean(axis=0)))
    file.write("score: %7.4f\n" % ridgecv.score(counts, energies))
    file.write("non-zero coefficients: %7.4f\n" % np.count_nonzero(ridgecv.coef_))
    # Plot Fit vs DFT
    cluster_energy_ce = ridgecv.predict(counts)
    plt.figure()
    plt.scatter(energies, cluster_energy_ce, color="r", alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('RIDGE Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_ce[i], x1, y1, x2, y2)
             for i in range(len(cluster_energy_ce))]
    plt.scatter(eahDFT, eahCE, color='r', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('RIDGE Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    # Show non-zero clusters
    cluster_coef = []
    cluster_pick = []
    cluster_coef.append(ridgecv.intercept_)
    cluster_coef_all = ridgecv.coef_
    cluster_nonzero = [c for c, v in enumerate(cluster_coef_all)
                       if abs(v) >= 1e-11]
    for i in cluster_nonzero:
        cluster_coef.append(cluster_coef_all[i])
        cluster_pick.append(clusters[i])
    file.write("\n Clusters\n")
    for i in range(len(cluster_pick)):
        if len(cluster_pick[i]) == 2:
            file.write(str(cluster_pick[i][0]) + ':' + '[0]' + ':' +
                       str(cluster_pick[i][1][0]) + ':' +
                       str(cluster_coef[i + 1]) + '\n')
        else:
            file.write(str(cluster_pick[i][0]) + ':' + str(cluster_pick[i][1]) + ':' +
                       str(cluster_pick[i][2][0]) + ':' +
                       str(cluster_coef[i + 1]) + '\n')

    ############# LIN REG FIT ##########################################
    file.write("\n #### Lin Reg #### \nNo cross validation\n")
    # (the original reported ridgecv statistics here by mistake)
    file.write("score: %7.4f\n" % linreg.score(counts, energies))
    file.write("non-zero coefficients: %7.4f\n" % np.count_nonzero(linreg.coef_))
    file.write('Intercept: %7.4f\n' % linreg.intercept_)
    # Plot Fit vs DFT
    cluster_energy_ce = linreg.predict(counts)
    plt.figure()
    plt.scatter(energies, cluster_energy_ce, color="g", alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('LinReg Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_ce[i], x1, y1, x2, y2)
             for i in range(len(cluster_energy_ce))]
    plt.scatter(eahDFT, eahCE, color='g', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('LinReg Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    # Show non-zero clusters
    cluster_coef = []
    cluster_pick = []
    cluster_coef.append(linreg.intercept_)
    cluster_coef_all = linreg.coef_
    cluster_nonzero = [c for c, v in enumerate(cluster_coef_all)
                       if abs(v) >= 1e-11]
    for i in cluster_nonzero:
        cluster_coef.append(cluster_coef_all[i])
        cluster_pick.append(clusters[i])
    file.write('\nClusters\n')
    for i in range(len(cluster_pick)):
        if len(cluster_pick[i]) == 2:
            file.write(str(cluster_pick[i][0]) + ':' + '[0]' + ':' +
                       str(cluster_pick[i][1][0]) + ':' +
                       str(cluster_coef[i + 1]) + '\n')
        else:
            file.write(str(cluster_pick[i][0]) + ':' + str(cluster_pick[i][1]) + ':' +
                       str(cluster_pick[i][2][0]) + ':' +
                       str(cluster_coef[i + 1]) + '\n')
    file.write('\n')
    file.close()
    return
def Model(train_linear, test_linear):
    train_linear_fea = train_linear.drop(columns=['SalePrice'])
    train_linear_tar = train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea,
                                                        train_linear_tar,
                                                        test_size=0.2,
                                                        random_state=0)

    def evaluate(model, test_features, test_labels, train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy

    real_train_tar = np.expm1(train_linear_tar)

    """
    . Lasso model
    """
    lassocv = LassoCV(alphas=np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ", lassocv_score)
    start = time.time()
    lasso = Lasso(normalize=True)
    lasso.set_params(alpha=lassocv_alpha, max_iter=10000)
    lasso.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso = pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(lasso, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_lasso_predict = lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_lasso = np.expm1(lasso.predict(test_linear))

    """
    . Ridge model
    """
    ridgecv = RidgeCV(alphas=np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ", ridgecv_score)
    coef = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    start = time.time()
    ridge = Ridge(normalize=True)
    ridge.set_params(alpha=ridgecv_alpha, max_iter=10000)
    ridge.fit(x_train, y_train)
    end = time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge = pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending=False)
    evaluate(ridge, x_test, y_test, x_train, y_train)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_ridge_predict = ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    test_prediction_ridge = np.expm1(ridge.predict(test_linear))

    """
    . Random Forest
    """
    #train = train.drop(columns=['DateSold'])
    #test = test.drop(columns=['DateSold'])
    #X_train = train.drop(columns=['SalePrice'])
    #Y_train = train['SalePrice']
    X_train = train_linear_fea
    Y_train = train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,
                                                                    test_size=0.2,
                                                                    random_state=0)
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    rf = RandomForestRegressor()
    # Random search of parameters, using 3-fold cross validation,
    # searching across 100 different combinations on all available cores
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                                   n_iter=100, cv=3, verbose=2, random_state=42,
                                   n_jobs=-1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    # Random search allowed us to narrow down the range for each hyperparameter.
    # Now that we know where to concentrate our search, we can explicitly
    # specify every combination of settings to try.
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110, 120, 130],
        'max_features': [2, 3],
        'min_samples_leaf': [1, 2, 3, 4],
        'min_samples_split': [2, 4, 6, 8, 10, 12],
        'n_estimators': [600, 700, 800, 900, 1000]
    }
    # Create a base model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                               cv=3, n_jobs=-1, verbose=2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    best_random = grid_search.best_estimator_
    start = time.time()
    best_random.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(best_random, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_rf_predict = best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features': train_linear_fea.columns,
                                  'imp': best_random.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_rf = importance_rf.iloc[:20, ]
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    test_prediction_rf = np.expm1(best_random.predict(test_linear))

    """
    . Xgboost
    """
    learning_rate = [round(float(x), 2) for x in np.linspace(start=.1, stop=.2, num=11)]
    # Minimum sum of instance weights for observations in a node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num=10)]
    n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num=20)]
    subsample = [0.3, 0.4, 0.5, 0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                   'max_depth': max_depth,
                   'min_child_weight': min_child_weight,
                   'subsample': subsample,
                   'n_estimators': n_estimators}
    # Make a RandomizedSearchCV object with the model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid,
                                    n_iter=1000, cv=5, verbose=2, random_state=42,
                                    n_jobs=-1)
    start = time.time()
    # Fit models
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    """
    best_params_ = {'learning_rate': 0.1,
                    'max_depth': 2,
                    'min_child_weight': 4,
                    'n_estimators': 900,
                    'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start = time.time()
    model_xgb.fit(x_train_rf, y_train_rf)
    end = time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf, x_train_rf, y_train_rf)
    print('Time elapsed: %.4f seconds' % (end - start))
    y_xgb_predict = model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line = x_line
    plt.scatter(real_train_tar, np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features': train_linear_fea.columns,
                                   'imp': model_xgb.feature_importances_}).\
        sort_values('imp', ascending=False)
    importance_top20_xgb = importance_xgb.iloc[:20, ]
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    test_prediction_xgb = np.expm1(model_xgb.predict(test_linear))

    return (test_prediction_lasso, test_prediction_ridge, test_prediction_rf,
            test_prediction_xgb, y_lasso_predict, y_ridge_predict,
            y_rf_predict, y_xgb_predict)
lasso_cofficients.append(lasso.coef_)
# plot the relationship between lambda and the regression coefficients
plt.plot(Lambdas, lasso_cofficients)
# log-transform the x axis
plt.xscale('log')
# axis labels
plt.xlabel('Lambda')
plt.ylabel('Coefficients')
# display the figure
plt.show()

# cross-validation for the LASSO regression model
lasso_cv = LassoCV(alphas=Lambdas, normalize=True, cv=10, max_iter=10000)
lasso_cv.fit(X_train, y_train)
# the best lambda value
lasso_best_alpha = lasso_cv.alpha_
lasso_best_alpha
# refit the model with the best lambda
lasso = Lasso(alpha=lasso_best_alpha, normalize=True, max_iter=10000)
lasso.fit(X_train, y_train)
# LASSO regression coefficients
pd.Series(index=['Intercept'] + X_train.columns.tolist(),
          data=[lasso.intercept_] + lasso.coef_.tolist())
# predict
lasso_predict = lasso.predict(X_test)
# validate prediction quality
RMSE = np.sqrt(mean_squared_error(y_test, lasso_predict))
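# Note: the `normalize` keyword used above was deprecated in scikit-learn 1.0
# and removed in 1.2. A sketch of the usual replacement -- standardising inside
# a Pipeline -- follows; it is not numerically identical to normalize=True
# (which scaled columns by their l2 norm), so the selected alpha may differ.
# X_train / y_train are the same variables as in the snippet above.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

pipe = make_pipeline(StandardScaler(), LassoCV(cv=10, max_iter=10000))
pipe.fit(X_train, y_train)
best_alpha = pipe[-1].alpha_  # the LassoCV step is the last pipeline element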
        clf.fit_cv(X_train, Y_train, [(X_cv, Y_cv)])
    else:
        clf.fit(X_train, Y_train)
    one_result = clf.predict(X_cv)
    blend_train[cv_index, j] = one_result
    cv_score = gini_normalized(Y_cv, blend_train[cv_index, j])
    cv_results[j, i] = cv_score
    # mean_absolute_error, so report it as MAE (the original printed it as MSE)
    score_mae = metrics.mean_absolute_error(Y_cv, one_result)
    print ('Fold [%s] norm. Gini = %0.5f, MAE = %0.5f' % (i, cv_score, score_mae))
    blend_test_j[:, i] = clf.predict(X_test)
blend_test[:, j] = blend_test_j.mean(1)
print ('Clf_%d Mean norm. Gini = %0.5f (%0.5f)' % (j, cv_results[j, :].mean(), cv_results[j, :].std()))
end_time = datetime.now()
time_taken = (end_time - start_time)
print ("Time taken for pre-blending calculations: {0}".format(time_taken))
print ("CV-Results", cv_results)
print ("Blending models.")
bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5,
               fit_intercept=True, max_iter=10000, positive=True)
bclf.fit(blend_train, Y_dev)
Y_test_predict = bclf.predict(blend_test)
cv_score = cv_results.mean()
print ('Avg. CV-Score = %s' % (cv_score))
submission = pd.DataFrame({"Id": test_ids, "Hazard": Y_test_predict})
submission = submission.set_index('Id')
submission.to_csv("farons_solution.csv")
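# A stripped-down sketch of the blending step above: out-of-fold predictions
# from two base models become the features for a non-negative LassoCV blender.
# The base models here are illustrative stand-ins, not the ones in the script.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, Ridge
from sklearn.model_selection import cross_val_predict

X, y = make_regression(n_samples=300, n_features=20, noise=10., random_state=0)
base_models = [Ridge(alpha=1.0),
               RandomForestRegressor(n_estimators=50, random_state=0)]
# one column of out-of-fold predictions per base model
blend_train = np.column_stack([cross_val_predict(m, X, y, cv=5)
                               for m in base_models])
# non-negative weights keep the blend an (approximately) convex combination
blender = LassoCV(cv=5, positive=True, fit_intercept=True).fit(blend_train, y)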
# ds, ss, cov_mat, N, ts, norm(), ndws() and seed() are assumed to be defined
# earlier in the script.
lassocv = LassoCV(cv=10, fit_intercept=False)
l = 1
seed(l)
for i in np.arange(len(ds)):
    d = ds[i]
    X = np.random.multivariate_normal(np.zeros(d), cov_mat[i], N)
    inv_cov = ndws(X)
    for ii in np.arange(len(ss)):
        s = ss[ii]
        beta_s = np.concatenate((np.ones(s), np.zeros(d - s)))
        mu = X.dot(beta_s)
        y = np.random.normal(mu, 1)
        try:
            lassocv.fit(X, y)
            beta_h = lassocv.coef_
            # one-step debiasing of the lasso estimate
            g = -X.T.dot(y - X.dot(beta_h)) / N
            beta_db = beta_h - inv_cov.dot(g)
            beta_d = beta_db - beta_s
            ts[i, ii, :, 0] = np.array([
                norm(beta_d, np.inf),
                norm(beta_d, 2),
                norm(beta_d, 1),
                np.abs(beta_d[1])
            ])
            ts[i, ii, :, 1] = np.array([
                norm(beta_d[:s], np.inf),
                norm(beta_d[:s], 2),
                norm(beta_d[:s], 1),
                np.abs(beta_d[:s][1])
from sklearn.linear_model import LassoCV
import pandas as pd

# raw string: the Windows path contains backslashes
train_data = pd.read_csv(r'D:\sufe\A\data_train_changed.csv')
train_data = train_data.iloc[0:, 1:].drop(['REPORT_ID', 'ID_CARD', 'LOAN_DATE'], axis=1)
train_data = train_data.dropna()
# print(train_data.info())
X = train_data.drop(['Y'], axis=1).values  # feature matrix
y = train_data['Y'].values  # target vector
lassocv = LassoCV()
lassocv.fit(X, y)
print(train_data.columns.drop('Y'), lassocv.coef_)
for feat in feats:
    print(gene_cols[feat])
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
feat1 = X_transform[:, 0]
feat2 = X_transform[:, 1]
ax.scatter(feat1, feat2, tol)
plt.xlabel("feat1")
plt.ylabel("feat2")
plt.show()
use_svr = input("Use svr? ")
if "yes" in use_svr:
    clf = SVR(kernel='linear', C=1.0, epsilon=0.2)
    clf.fit(X, y)
    # recursive feature elimination down to 5 features, dropping 5 per step
    selector = RFE(clf, 5, step=5)
    selector = selector.fit(X, y)
    gene_pos = selector.get_support(indices=True)
    for pos in gene_pos:
        print(gene_cols[pos] + '\t' + str(pos))
if False:  # disabled exploratory block ('if True == False' in the original)
    pca = PCA()
    pca.fit(exp_matrix)
    selector = VarianceThreshold()
    selector.fit(exp_matrix)
    exp_vars = selector.variances_
    av_var = sum(exp_vars) / len(exp_vars)
    std_var = np.std(exp_vars)
# binary_y_pre, the capitalised hyperparameters (Normalize, Fit_Intercept,
# Max_Iter, CV, Tol, Verbose, Scoring, N_Estimators, number_of_features),
# X, y, corpus and vectorizer_binary are assumed to be defined earlier.
for i in range(len(y)):
    if y[i] > 0:
        binary_y_pre.append(1)
    else:
        binary_y_pre.append(0)
binary_y = np.array(binary_y_pre)
coef_path_linear_cv = LinearRegression(normalize=Normalize, fit_intercept=Fit_Intercept)
coef_path_lasso_cv = LassoCV(normalize=Normalize, max_iter=Max_Iter, copy_X=True,
                             cv=CV, verbose=Verbose, fit_intercept=Fit_Intercept,
                             tol=Tol)  # , alphas=Alphas)
coef_path_elastic_cv = ElasticNetCV(normalize=Normalize, max_iter=Max_Iter, tol=Tol)  # , alphas=Alphas)
coef_path_logistic_cv = LogisticRegression(tol=Tol)
coef_path_binary_x_logistic_cv = LogisticRegression(tol=Tol)
coef_path_forest_cv = RandomForestClassifier(n_estimators=N_Estimators,
                                             max_features=number_of_features)
binary_X = vectorizer_binary.fit_transform(corpus)
coef_path_forest_cv.fit(X, binary_y)
coef_path_lasso_cv.fit(X, y)
coef_path_binary_x_logistic_cv.fit(binary_X, binary_y)
coef_path_logistic_cv.fit(X, binary_y)
coef_path_elastic_cv.fit(X, y)
forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y,
                                                   n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y,
                                                  n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y,
                                                    n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y,
                                                     n_jobs=2, cv=CV, scoring='roc_auc')
binary_x_logistic_cv_score = cross_validation.cross_val_score(
    coef_path_binary_x_logistic_cv, binary_X, binary_y,
    n_jobs=2, cv=CV, scoring='roc_auc')
forest_results_parameters = [coef_path_forest_cv.predict(X),
                             coef_path_forest_cv.get_params,
                             coef_path_forest_cv.feature_importances_,
                             coef_path_forest_cv.classes_,
                             coef_path_forest_cv.n_classes_]
forest_scores = [forest_cv_score,
                 classification_report(binary_y, forest_results_parameters[0]),
                 'forest']
lasso_results_parameters = [coef_path_lasso_cv.predict(X),
                            coef_path_lasso_cv.get_params,
                            coef_path_lasso_cv.alphas_,
                            coef_path_lasso_cv.coef_]
def feature_selection(method, instance_db, target, percentage):
    # Assumes pearson() and ReliefF are available in the enclosing scope.
    instance_db_values = instance_db.values
    if method == "lasso":
        lassocv = LassoCV(max_iter=5000, normalize=True, alphas=[0.0001])
        lassocv.fit(instance_db_values, target)
        # sort features according to coef_
        coef = abs(lassocv.coef_)
        feature = zip(instance_db.columns, coef)
        feature = sorted(feature, key=lambda tup: tup[1], reverse=True)
        features = pd.DataFrame([tup[0] for tup in feature])
        # the features chosen
        mask = pd.DataFrame([item[1] for item in feature]) >= \
            feature[int(len(feature) * percentage) - 1][1]
        feature_selected = pd.DataFrame(features)[:][np.asarray(mask).reshape(-1)]
        feature_selected = [item for items in feature_selected.values
                            for item in items]
    if method == "pearson":
        scores, _ = pearson(instance_db_values, target)
        scores = np.nan_to_num(scores)
        scores = [abs(term) for term in scores]
        # sort features according to scores_
        feature = zip(instance_db.columns, scores)
        feature = sorted(feature, key=lambda tup: tup[1], reverse=True)
        features = pd.DataFrame([tup[0] for tup in feature])
        # the features chosen
        mask = pd.DataFrame([item[1] for item in feature]) >= \
            feature[int(len(feature) * percentage - 1)][1]
        feature_selected = pd.DataFrame(features)[:][np.asarray(mask).reshape(-1)]
        feature_selected = [item for items in feature_selected.values
                            for item in items]
    if method == "reliefF":
        select = ReliefF(n_neighbors=5)
        instance_db_values = np.array(instance_db_values, np.float64)
        max_batch_instance = 5000
        if instance_db_values.shape[0] > max_batch_instance:
            for i in range(int(instance_db_values.shape[0] / max_batch_instance)):
                instance = instance_db_values[i * max_batch_instance:(i + 1) * max_batch_instance]
                tar = target[i * max_batch_instance:(i + 1) * max_batch_instance]
                select.fit_transform(instance, tar)
                if i == 0:
                    feature_importances = select.feature_importances_
                else:
                    feature_importances += np.asarray(select.feature_importances_)
            feature_importances = list(feature_importances)
        else:
            select.fit_transform(instance_db_values, target)
            feature_importances = list(select.feature_importances_)
        # sort features according to importance_
        feature = zip(instance_db.columns, feature_importances)
        feature = sorted(feature, key=lambda tup: tup[1], reverse=True)
        features = pd.DataFrame([tup[0] for tup in feature])
        # the features chosen
        mask = pd.DataFrame([item[1] for item in feature]) >= \
            feature[int(len(feature) * percentage - 1)][1]
        feature_selected = pd.DataFrame(features)[:][np.asarray(mask).reshape(-1)]
        feature_selected = [item for items in feature_selected.values
                            for item in items]
    return feature_selected
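# The lasso branch above, condensed into a short helper: rank features by
# |coef_| and keep the top fraction. Uses the same fixed alpha as the snippet
# but drops the removed `normalize` keyword; a sketch, not the original code.
import numpy as np
from sklearn.linear_model import LassoCV

def top_fraction_by_lasso(X, y, columns, fraction):
    # absolute coefficient magnitudes as importance scores
    coef = np.abs(LassoCV(max_iter=5000, alphas=[0.0001]).fit(X, y).coef_)
    order = np.argsort(coef)[::-1]          # descending by importance
    k = max(1, int(len(columns) * fraction))
    return [columns[i] for i in order[:k]]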
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the `output` experiment output directory.
    figdir : str
        Path to the `figure` experiment output directory.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object containing the coefficients
        learned by training the built-in model.
    """
    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':
        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(
            lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns
                             if c not in ['sc1', 'sumfeature', 'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] +
                               [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':
        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(
            df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(
            lambda row: row['coefficient'] * (1 - RT) if row['coefficient'] > 0 else delta,
            axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(
            df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':
        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':
        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':
        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':
        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # (the original built X from all of feature_columns here, which
        # silently discarded the lasso selection)
        # first add an intercept to the features manually
        X = df_train[non_zero_features].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # keep the features that are still non-zero after the NNLS fit
        non_zero_features = [feature for feature, coefficient
                             in zip(non_zero_features, coefficients)
                             if coefficient != 0]

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':
        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':
        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)),
                                 ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)),
                   index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(),
                                 axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas['standardized'] / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with main model fit metrics and save to the file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
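# The standardized/relative beta computation above, reduced to a small helper
# (a sketch for illustration; names are hypothetical, not from the function):
import numpy as np

def standardized_betas(coefs, feature_stds, target_std):
    # coefs and feature_stds are aligned 1-D arrays; target_std is a scalar
    betas = coefs * feature_stds / target_std
    relative = betas / np.sum(np.abs(betas))
    return betas, relative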
print(n)
p = 101
K = 10  # K-fold CV
y = y.reshape(n)
# log-scale grid (note: unused below, since LassoCV builds its own path)
alphas = np.exp(np.linspace(np.log(0.01), np.log(1), 100))
N = len(alphas)  # number of repeated train/test splits
scores = np.zeros(N)
alpha = np.zeros(N)
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
for i in range(N):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    clf = LassoCV(n_alphas=100, cv=K)
    clf = clf.fit(X_train, y_train)
    scores[i] = clf.score(X_test, y_test)
    alpha[i] = clf.alpha_
scores = np.asarray(scores)
max_score_index = np.argmax(scores)
best_alpha = alpha[max_score_index]
print(best_alpha)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
clf = Lasso(alpha=best_alpha)
# clf = LassoCV(n_alphas=100, cv=K, precompute='auto', n_jobs=2, normalize='True')
clf = clf.fit(X_train, y_train)
scores = clf.score(X_test, y_test)
print(predictor_var[0])
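# The loop above is, in effect, nested cross-validation: LassoCV selects alpha
# on each training split and the outer split scores the result. A sketch of
# the same idea with scikit-learn's own utility (K, X and y as in the snippet):
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_score

outer_scores = cross_val_score(LassoCV(n_alphas=100, cv=K), X, y, cv=5)
print(outer_scores.mean(), outer_scores.std())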
print("EVS:",explained_variance_score(y_test,y_p)) md=dnn_reg(X_train,y_train,X_test,y_test) reg_eval(X_test,y_test,md) ###Lasso CV regression def reg_eval2(y_test,model): y_pred=model.predict(X_test) print("evaluation the results for model:",model) print("MSE:",mean_squared_error(y_test,y_pred)) print("R2:",r2_score(y_test,y_pred)) print("EVS:",explained_variance_score(y_test,y_pred)) lasso = LassoCV(cv=5, random_state=0,max_iter=10000) lasso.fit(X_train,y_train) reg_eval2(y_test,lasso) #ElasticNet Regressionb ela = ElasticNetCV(l1_ratio=0.8,normalize=True,max_iter=5000,random_state=77) ela.fit(X_train,y_train) print("R square:",ela.score(X_test,y_test)) reg_eval2(y_test,ela) #SVR Regression from sklearn.svm import LinearSVR LSVR=LinearSVR(epsilon=0.1,random_state=0, tol=1e-5,max_iter=10000) # scaler=RobustScaler() # pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)]) LSVR.fit(X_train,y_train)
# display the coefficients for alpha = 0.25
print(pandas.DataFrame({'Variables': nom_var, 'Coefficients': coefs25}))

# cross-validation for the lasso
from sklearn.linear_model import LassoCV

# tool for detecting the best-performing solution under cross-validation
lcv = LassoCV(alphas=my_alphas, normalize=False, fit_intercept=False,
              random_state=0, cv=5)
# fit on the training sample
lcv.fit(ZTrain[:, :16], ZTrain[:, 16])
# the alpha values that were tried
print(lcv.alphas_)
# cross-validated MSE values
print(lcv.mse_path_)
# mean cross-validated MSE for each alpha
avg_mse = numpy.mean(lcv.mse_path_, axis=1)
# alphas vs. cross-validated MSE
print(pandas.DataFrame({'alpha': lcv.alphas_, 'MSE': avg_mse}))
# as a plot
plt.plot(lcv.alphas_, avg_mse)
msg("Fitting!") weights = np.ones(train.shape[0]) do_statsmodels=True if do_statsmodels: ols = sm.wls(formula=formula, data=train, weights=weights).fit() print ols.summary() msg("Making predictions for all playergames") yy_df['ols_prediction'] = ols.predict(yy_df) else: ols_lr = LassoCV(n_jobs=-1, verbose=True) X = train[rhs_cols] y = train['elo'] ols_lr.fit(X,y) yy_df['ols_prediction'] = ols_lr.predict(X) yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs() yy_df['training'] = (yy_df['gamenum'] % 3) insample_scores = yy_df.groupby('training')['ols_error'].agg({'mean' : np.mean, 'median' : np.median, 'stdev': np.std}) print insample_scores msg("Error summary by ELO:") elo_centuries = cut(yy_df['elo'], 20) print yy_df.groupby(elo_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean}) msg("Error summary by gamenum:") gamenum_centuries = cut(yy_df['gamenum'], 20) print yy_df.groupby(gamenum_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean})
print(train_data.shape)
print(test_data.shape)
params = {'loss': 'ls', 'learning_rate': 0.022, 'n_estimators': 2825,
          'max_depth': 4, 'subsample': 0.9, 'min_samples_split': 2,
          'min_samples_leaf': 1, 'random_state': 1, 'max_features': 'log2',
          'alpha': 0.9}
model_gbr = GradientBoostingRegressor(**params)
y_predict_gbr = model_gbr.fit(train_data, train_data_y).predict(test_data)
model_Lasso = LassoCV(normalize=False, alphas=np.arange(0.0001, 0.01, 0.0001),
                      cv=ShuffleSplit(n_splits=5, test_size=0.2), n_jobs=-1)
y_predict_lasso = model_Lasso.fit(train_data, train_data_y).predict(test_data)
model_bridge = BayesianRidge()
y_predict_bridge = model_bridge.fit(train_data, train_data_y).predict(test_data)
answer_true = pd.read_csv(r'D:\desktop\天池\AI\season two\[new] fusai_answer_a_20180127.csv',
                          header=None).iloc[:, -1]
# scan blend weights between the GBR and lasso predictions
p = np.arange(0.1, 1.0, 0.001)
for i in list(p):
    y_predict = i * y_predict_gbr + (1 - i) * y_predict_lasso
    mse1 = metrics.mean_squared_error(y_predict, answer_true)
    if mse1 < 0.026:
        print(mse1)
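# A small follow-up to the weight scan above: rather than printing every blend
# weight whose MSE falls below a threshold, the best weight can be taken
# directly with argmin (p, y_predict_gbr, y_predict_lasso, answer_true and
# metrics as in the snippet above).
import numpy as np

mses = np.array([metrics.mean_squared_error(w * y_predict_gbr + (1 - w) * y_predict_lasso,
                                            answer_true) for w in p])
best_w = p[np.argmin(mses)]
print(best_w, mses.min())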