def train_BayesianRegressionModel(
    X,
    y,
    n_iter=300,
    tol=0.001,
    alpha_1=1e-06,
    alpha_2=1e-06,
    lambda_1=1e-06,
    lambda_2=1e-06,
    compute_score=False,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    verbose=False,
):
    """Build a ``BayesianRidge`` estimator, fit it on ``(X, y)`` and return it.

    Every keyword argument is forwarded unchanged, under the same name, to
    the ``BayesianRidge`` constructor. The value returned is whatever
    ``fit`` returns.
    """
    hyperparameters = dict(
        n_iter=n_iter,
        tol=tol,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        compute_score=compute_score,
        fit_intercept=fit_intercept,
        normalize=normalize,
        copy_X=copy_X,
        verbose=verbose,
    )
    return BayesianRidge(**hyperparameters).fit(X, y)
def bayesian_ridge_regression(feature_array, label_array):
    """Fit Bayesian ridge and OLS models, then display three figures.

    Figure 1 overlays the Bayesian ridge coefficients, ``label_array`` and
    the OLS coefficients. Figure 2 is a log-scaled histogram of the Bayesian
    ridge coefficients. Figure 3 plots ``scores_`` of the Bayesian ridge fit.
    Nothing is returned; ``plt.show()`` is called at the end.

    NOTE(review): the series labelled "Ground truth" is ``label_array`` (the
    regression targets), not a vector of true weights -- confirm intent.
    """
    fig_size = (6, 5)
    n_bins = 9

    bayes = BayesianRidge(compute_score=True)
    bayes.fit(feature_array, label_array)

    least_squares = LinearRegression()
    least_squares.fit(feature_array, label_array)

    # Figure 1: the three curves, drawn in the original order.
    plt.figure(figsize=fig_size)
    plt.title("Weights of the model")
    curves = (
        (bayes.coef_, 'b-', "Bayesian Ridge estimate"),
        (label_array, 'g-', "Ground truth"),
        (least_squares.coef_, 'r--', "OLS estimate"),
    )
    for series, line_format, legend_label in curves:
        plt.plot(series, line_format, label=legend_label)
    plt.xlabel("Features")
    plt.ylabel("Values of the weights")
    plt.legend(loc="best", prop=dict(size=12))

    # Figure 2: histogram of the Bayesian ridge coefficients.
    plt.figure(figsize=fig_size)
    plt.title("Histogram of the weights")
    plt.hist(bayes.coef_, bins=n_bins, log=True)
    plt.ylabel("Features")
    plt.xlabel("Values of the weights")
    plt.legend(loc="lower left")

    # Figure 3: score recorded at each iteration of the fit.
    plt.figure(figsize=fig_size)
    plt.title("Marginal log-likelihood")
    plt.plot(bayes.scores_)
    plt.ylabel("Score")
    plt.xlabel("Iterations")

    plt.show()
def bayes_ridge_reg(self):
    """Fit ``BayesianRidge`` on the instance data and return integer predictions.

    The model is trained on ``self.x_data`` / ``self.y_data`` and then used to
    predict ``self.x_data`` again. The coefficients, the intercept and the
    value of ``get_accuracy(predictions, self.y_data)`` are printed.

    :return: list with ``int(value)`` for every prediction.
    """
    br = BayesianRidge()
    br.fit(self.x_data, self.y_data)
    adjusted_result = br.predict(self.x_data)

    # One pre-formatted argument per print call: this emits the same text
    # under the Python 2 print statement and the Python 3 print function.
    print("bayes ridge params %s %s" % (br.coef_, br.intercept_))
    print("bayes ridge accuracy %s" % (get_accuracy(adjusted_result, self.y_data),))

    # A real list on both Python 2 and 3 (map() is lazy on Python 3).
    return [int(value) for value in adjusted_result]
def ridreg(df, test):
    """Fit ``BayesianRidge`` on ``df`` and write predictions for ``test`` to CSV.

    The model is trained on the ``time`` and ``temp`` columns of ``df``
    against its ``count`` column. Predictions for the same two columns of
    ``test`` are written, without index, to ``data/submission3.csv``.
    The first three rows of the test features and of the predictions are
    printed.

    :param df: training frame with ``count``, ``time`` and ``temp`` columns.
    :param test: frame with ``time`` and ``temp`` columns to predict for.
    """
    clf = BayesianRidge()
    target = df['count']
    train = df[['time', 'temp']]
    # Use the frame that was passed in; the previous code read an undefined
    # name ``test2`` and ignored this parameter.
    test = test[['time', 'temp']]
    clf.fit(train, target)

    print(test.head(3))
    # Plain nested list of floats, one inner list per test row.
    final = [[float(x) for x in row] for row in test.values]
    predicted_probs = pd.DataFrame(clf.predict(final))

    print(predicted_probs.head(3))
    predicted_probs.to_csv('data/submission3.csv', index=False)
def bayesRegr(source, target):
    """Train ``BayesianRidge`` on ``source`` and predict for ``target``.

    In both frames every column except the last is used as a feature; the
    last column of ``source`` is the value being regressed.

    :return: the output of ``predict`` on the feature columns of ``target``.
    """
    feature_names = source.columns[:-1]
    response = source[source.columns[-1]]

    model = BayesianRidge()
    model.fit(source[feature_names], response)

    target_features = target[target.columns[:-1]]
    return model.predict(target_features)
def br_modeling(data, y_name, candidates_location):
    """Fit ``BayesianRidge`` on the candidate regressors and persist the model.

    A copy of ``data`` is passed through ``rf_trim`` together with the
    candidate list returned by ``get_variables``. The fitted model is dumped
    with ``joblib`` to ``./<y_name>br_model<today>.pkl`` and returned.
    """
    from sklearn.linear_model import BayesianRidge

    working = data.copy()
    candidates = get_variables("./%s" % candidates_location)
    working = rf_trim(working, y_name, candidates)

    fitted = BayesianRidge().fit(working[candidates], working[y_name])

    dump_path = "./%sbr_model%s.pkl" % (y_name, datetime.datetime.today())
    joblib.dump(fitted, dump_path)
    return fitted
def fit_model_10(self,toWrite=False):
    """Fit ``BayesianRidge`` on each CV split, print its score, optionally pickle it.

    One estimator is created and refit on every
    ``(X_train, X_test, Y_train, Y_test)`` tuple in ``self.cv_data``; after
    each fit ``logloss(Y_test, pred)`` is printed.

    :param toWrite: when true, pickle the estimator to ``model10/model.pkl``.
    """
    model = BayesianRidge(n_iter=5000)

    for X_train, X_test, Y_train, Y_test in self.cv_data:
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 10 score %f" % (logloss(Y_test, pred),))

    if toWrite:
        # Pickle data is bytes, so the file has to be opened in binary mode;
        # the context manager also closes it if dump() raises.
        with open('model10/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
def br_modeling(data,y_name,candidates_location):
    """Fit and persist a ``BayesianRidge`` model, printing a line after each step.

    A copy of ``data`` is passed through ``rf_trim`` together with the
    candidate list returned by ``get_variables``. The fitted model is dumped
    with ``joblib`` to ``./<y_name>br_model<today>.pkl`` and returned.
    """
    from sklearn.linear_model import BayesianRidge

    working = data.copy()
    print("made temp copy")

    candidates = get_variables("./%s" % candidates_location)
    print("got candidates for regressors")

    working = rf_trim(working, y_name, candidates)
    print("trimmed dataset")

    estimator = BayesianRidge()
    print("assigned model")

    fitted = estimator.fit(working[candidates], working[y_name])
    print("fit model")

    dump_path = "./%sbr_model%s.pkl" % (y_name, datetime.datetime.today())
    joblib.dump(fitted, dump_path)
    print("saved model")

    return fitted
def fit_polynomial_bayesian_skl(X, Y, degree, lambda_shape=1.e-6, lambda_invscale=1.e-6, padding=10, n=100, X_unknown=None):
    """Fit a polynomial of the given degree with ``BayesianRidge``.

    ``X`` is expanded with ``pol.polyvander`` and the estimator is created
    with ``lambda_1=lambda_shape`` and ``lambda_2=lambda_invscale``.

    :return: a list holding, in order:
        1. the coefficient vector, with the fitted intercept added to its
           first entry;
        2. ``((x, y_predict), (x, y_dot))`` on ``n`` evenly spaced points
           from ``X.min() - padding`` to ``X.max() + padding``, where the
           first curve comes from ``predict`` and the second from
           ``np.dot`` with the coefficients;
        3. only when ``X_unknown`` is not None: the same pair evaluated at
           ``X_unknown``.
    """
    estimator = BayesianRidge(lambda_1=lambda_shape, lambda_2=lambda_invscale)
    estimator.fit(pol.polyvander(X, degree), Y)

    # The Vandermonde matrix starts with a column of ones, so the separately
    # fitted intercept is folded into the constant-term coefficient.
    coeff = np.copy(estimator.coef_)
    coeff[0] += estimator.intercept_

    def _evaluate(points):
        # Evaluate once through predict() and once through a dot product.
        design = pol.polyvander(points, degree)
        by_predict = estimator.predict(design)
        by_dot = np.dot(design, coeff)
        return ((points, by_predict), (points, by_dot))

    grid = np.linspace(X.min() - padding, X.max() + padding, n)
    results = [coeff, _evaluate(grid)]

    if X_unknown is not None:
        results.append(_evaluate(X_unknown))

    return results
def train_classiifer(X_train, y_train, to_tune, classifier):
    """Fit and return a regressor.

    :param X_train: training features, passed to ``fit``.
    :param y_train: training targets, passed to ``fit``.
    :param to_tune: when true and no ``classifier`` is supplied, run a
        randomized search over ``C`` and ``gamma`` of the default ``SVR``
        and continue with its ``best_estimator_``.
    :param classifier: estimator to use instead of the default ``SVR``.
        A falsy value (for example ``None``) selects the default. When an
        estimator is supplied, tuning is skipped.
    :return: the estimator after a final ``fit(X_train, y_train)``.
    """
    # Truth-testing an estimator can raise: an object whose __len__ reads a
    # fitted attribute (e.g. an unfitted ensemble) fails with AttributeError.
    # Such an object was clearly supplied on purpose.
    try:
        use_supplied = bool(classifier)
    except AttributeError:
        use_supplied = True

    if use_supplied:
        clf = classifier
    else:
        clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
        if to_tune:
            # Randomized search: find optimal SVR parameters.
            param_grid = {'C': sp_rand(), 'gamma': sp_rand()}
            rsearch = RandomizedSearchCV(estimator=clf,
                                         param_distributions=param_grid,
                                         n_iter=5000)
            rsearch.fit(X_train, y_train)
            clf = rsearch.best_estimator_

    clf.fit(X_train, y_train)
    return clf
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe

    The fitted model, five test-set metrics and the test predictions are
    pickled one after another into
    ``../trained_networks/brr_<n_features>_data.pkl``, in this order: model,
    mean absolute error, mean squared error, median absolute error, R^2,
    explained variance score, predictions.

    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: integer used in the output file name
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)

    # Readers must call pickle.load() repeatedly, in the same order.
    payload = (clf, mean_abs, mean_sq, median_abs, r2, exp_var_score, y_pred)
    with open('../trained_networks/brr_%d_data.pkl' % n_features, 'wb') as results:
        for item in payload:
            pickle.dump(item, results, pickle.HIGHEST_PROTOCOL)

    return
# Lasso lassoCV_model = LassoCV(alphas = alphas) lassoCV_model.fit(X_train, y_train); We can then see which values of `alpha` performed best with the following. print('Ridge alpha:', ridgeCV.alpha_) print('Lasso alpha:', lassoCV.alpha_) ## Bayesian Regression We can also fit Bayesian regression using `scikit-learn` (though another popular package is `pymc3`). A very straightforward implementation is provided below. from sklearn.linear_model import BayesianRidge bayes_model = BayesianRidge() bayes_model.fit(X_train, y_train); This is not, however, identical to our construction in the previous section since it infers the $\sigma^2$ and $\tau$ parameters, rather than taking those as fixed inputs. More information can be found [here](https://scikit-learn.org/stable/modules/linear_model.html#bayesian-regression). The hidden chunk below demonstrates a hacky solution for running Bayesian regression in `scikit-learn` using known values for $\sigma^2$ and $\tau$, though it is hard to imagine a practical reason to do so. ````{toggle} By default, Bayesian regression in `scikit-learn` treats $\alpha = \frac{1}{\sigma^2}$ and $\lambda = \frac{1}{\tau}$ as random variables and assigns them the following prior distributions: $$ \begin{aligned} \alpha &\sim \text{Gamma}(\alpha_1, \alpha_2) \\ \lambda &\sim \text{Gamma}(\lambda_1, \lambda_2). \end{aligned} $$
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

N_SPLITS = 5
rng = np.random.RandomState(0)

X_full, y_full = fetch_california_housing(return_X_y=True)
# Keep every tenth sample (~2k rows), which is enough for this example.
# Drop the next line for a slower run with different error bars.
X_full, y_full = X_full[::10], y_full[::10]
n_samples, n_features = X_full.shape

# Baseline: cross-validated score on the complete data (no missing values).
br_estimator = BayesianRidge()
_full_data_scores = cross_val_score(
    br_estimator,
    X_full,
    y_full,
    scoring='neg_mean_squared_error',
    cv=N_SPLITS,
)
score_full_data = pd.DataFrame(_full_data_scores, columns=['Full Data'])

# Blank out exactly one randomly chosen feature in every row.
X_missing = X_full.copy()
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan
# Weights are drawn with precision lambda_ = 4, i.e. std 1 / sqrt(lambda_).
lambda_ = 4.
weight_scale = 1. / np.sqrt(lambda_)
w = np.zeros(n_features)
# Ten indices are drawn (repeats possible); only those weights are non-zero.
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=weight_scale)

# Noise is drawn with precision alpha_ = 50.
alpha_ = 50.
noise_scale = 1. / np.sqrt(alpha_)
noise = stats.norm.rvs(loc=0, scale=noise_scale, size=n_samples)

# Target = linear signal + noise.
y = np.dot(X, w) + noise

###############################################################################
# Fit Bayesian ridge regression, and OLS as a reference.
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Compare the estimated weights with the true ones.
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
for _series, _fmt, _label in (
    (clf.coef_, 'b-', "Bayesian Ridge estimate"),
    (w, 'g-', "Ground truth"),
    (ols.coef_, 'r--', "OLS estimate"),
):
    plt.plot(_series, _fmt, label=_label)
plt.xlabel("Features")
plt.ylabel("Values of the weights")
plt.legend(loc="best", prop=dict(size=12))
plt.title('Polynomial Predicted Coronavirus deaths Cases Over Time in India', size=30)
plt.xlabel('Days Since 1/22/2020', size=20)
plt.ylabel('No.of Cases(in Lakhs)', size=20)
plt.legend(['deaths Cases', 'Polynomial Regression Predictions'])
plt.xticks(size=15)
plt.show()

# Table of the polynomial-regression forecast for the future days.
pol_ind_deaths_days = pol_ind_deaths_days.reshape(1,-1)[0]
df_ind_deaths_poly_predict = pd.DataFrame({'Date': prediction_dates[-(days_in_future):], 'Polynomial Regression Predicted # of deaths Cases India': np.round(pol_ind_deaths_days[-(days_in_future):])})
df_ind_deaths_poly_predict

# Bayesian Ridge
# Fit with default settings and print the parameters in use.
reg_world_deaths=BayesianRidge()
reg_world_deaths.fit(xtrain_world_deaths,ytrain_world_deaths)
# get_params must be called; printing the attribute only shows the bound
# method's repr.
print(reg_world_deaths.get_params())

# World
reg_world_deaths=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None, compute_score=False, copy_X=True, fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300, normalize=False, tol=0.001, verbose=False)
reg_world_deaths.fit(xtrain_world_deaths,ytrain_world_deaths)
reg_world_deaths_test = reg_world_deaths.predict(xtest_world_deaths)
reg_world_deaths_predict_days = reg_world_deaths.predict(prediction_days)

# The metric functions take (y_true, y_pred). MAE and MSE are symmetric, but
# R2 is not, so the true values have to come first.
print('MAE:', metrics.mean_absolute_error(ytest_world_deaths, reg_world_deaths_test))
print('MSE:',metrics.mean_squared_error(ytest_world_deaths, reg_world_deaths_test))
print('R2 :',metrics.r2_score(ytest_world_deaths, reg_world_deaths_test))
def main():
    """Score every ordered pair of filter consensus motifs and plot interactions.

    Steps, in order:

    1. Parse the command line (one positional argument: the model file).
    2. Obtain the filter consensus motifs via ``get_filter_consensus``.
    3. For every ordered pair ``(i, j)`` of filters build a 4 x seq_length
       array filled with 0.25, with motif ``i`` written left of the centre
       and motif ``j`` right of it.
    4. Save the arrays to ``<out_dir>/motif_seqs.h5``, run
       ``th basset_place2_predict.lua`` on them and read the ``scores``
       dataset from ``<out_dir>/motif_seqs_scores.h5``.
    5. For each requested target fit a ``BayesianRidge`` model on indicator
       features (first-motif id, second-motif id), write the first
       ``num_filters`` coefficients and the score/prediction/residual table,
       clip the residuals at their 0.999 quantile and save a heat map.

    NOTE(review): the ``-t`` help text mentions ``-1 for all``, but the value
    is only ever used as a column index here -- confirm intended behaviour.
    """
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    filter_len = filter_consensus[0].shape[1]

    # position the motifs; floor division keeps the slice bounds integral on
    # Python 3 and matches integer "/" on Python 2
    left_i = options.seq_length//2 - options.center_dist - filter_len
    right_i = options.seq_length//2 + options.center_dist

    # background: every position is uniform over the four nucleotides
    ns_1hot = np.zeros((4,options.seq_length)) + 0.25

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))

    #################################################################
    # predict scores for the placed motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    with h5py.File(seqs_file, 'w') as h5f:
        h5f.create_dataset('test_in', data=seqs_1hot)

    # predict scores
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    with h5py.File(scores_file, 'r') as hdf5_in:
        motif_seq_scores = np.array(hdf5_in['scores'])

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # Row xi describes pair (i, j): column i flags the first motif and
        # column num_filters + j flags the second one.
        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print(model.score(X, motif_seq_scores[:,ti]))

        # print filter coefficients
        with open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w') as coef_out:
            for i in range(num_filters):
                coef_out.write('%3d %6.2f' % (i,model.coef_[i]) + '\n')

        #################################################################
        # normalize pairwise predictions
        #################################################################
        # interaction = observed score minus the model's prediction
        filter_interaction = np.zeros((num_filters,num_filters))
        with open('%s/table_t%d.txt' % (options.out_dir,ti), 'w') as table_out:
            si = 0
            for i in range(num_filters):
                for j in range(num_filters):
                    filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                    cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                    table_out.write('%3d %3d %6.3f %6.3f %6.3f' % cols + '\n')
                    si += 1

        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print('Limiting scores to +-%f' % max_score)

        # clip every interaction into [-max_score, max_score]
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
        # release the figure so that long target lists do not accumulate them
        plt.close()
# Shift `clean` so its minimum is 0, scale by the maximum, cast to bool.
c = clean - np.min(clean)
c /= c.max()
c = c.astype(bool)

io.imsave("/Users/qcaudron/Desktop/charo/2_smoothed.jpg", ski.img_as_uint(surf))

# <codecell>

# Mean profile of `surf` along each axis.
z1 = np.mean(surf, axis=0)
z2 = np.mean(surf, axis=1)

#for i in range(surf.shape[1]) :
#    plt.plot(surf[:, i], "k")
#plt.plot(z2)

# Fits on a two-column Vandermonde matrix of the index, one per axis.
r = [
    BayesianRidge().fit(
        np.vander(np.arange(surf.shape[i]), 2),
        np.mean(surf, axis=1 - i),
    )
    for i in [0, 1]
]

# Fit of z1 against its index, as a single-column design matrix.
r1 = BayesianRidge().fit(np.arange(len(z1)).reshape(-1, 1), z1)

# Same for z2, fitted with the first and last 500 samples left out.
_z2_inner = z2[500:-500]
r2 = BayesianRidge().fit(np.arange(len(_z2_inner)).reshape(-1, 1), _z2_inner)

#plt.plot(r1.predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=5)
# The trimmed fit is evaluated over the full index range of z2.
plt.plot(r2.predict(np.arange(len(z2)).reshape(-1, 1)), linewidth=5)
plt.plot(z2, linewidth=5)

#plt.axhline(b[np.argmax(h)], c="r", linewidth=3)
#plt.plot(r[0].predict(np.vander(np.arange(surf.shape[0]), 2)), linewidth=3)
#plt.plot(r[0].predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=3)
#plt.plot(r[0].predict(np.expand_dims(np.arange(surf.shape[0]), axis=1)), linewidth=5)
#plt.axhline(np.mean(z1 / r1.predict(np.arange(len(z1)).reshape(len(z1),1))))

# <codecell>

# Same trimmed fit on the logarithm of z2.
lz = np.log(z2)
_lz_inner = lz[500:-500]
r3 = BayesianRidge().fit(np.arange(len(_lz_inner)).reshape(-1, 1), _lz_inner)
# The models are evaluated on the same data they are trained on.
xt = x

# A parenthesised single argument prints the same text under the Python 2
# print statement and the Python 3 print function.

# Linear Regression
print('linear')
lr = LinearRegression()
lr.fit(x, y)
lr_sts_scores = lr.predict(xt)

# Bayesian Ridge Regression
print('baysian ridge')
br = BayesianRidge(compute_score=True)
br.fit(x, y)
br_sts_scores = br.predict(xt)

# Elastic Net
print('elastic net')
enr = ElasticNet()
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)
print('\nmethod = ', method) if (method == 1): print('Multilayer perceptron (MLP) neural network 01') str_method = 'MLP model01' r = MLPRegressor(hidden_layer_sizes=(4, ), max_iter=40) if (method == 2): print('Multilayer perceptron (MLP) neural network 02') str_method = 'MLP model02' r = MLPRegressor(hidden_layer_sizes=(5, ), max_iter=30) if (method == 3): print('Bayesian Ridge') str_method = 'BayesianRidge' r = BayesianRidge(compute_score=True) # class sklearn.ensemble.BaggingRegressor(base_estimator=None, n_estimators=10, # max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False, # oob_score=False, warm_start=False, n_jobs=1, random_state=None, verbose=0) if (method == 4): print('Bagging Regressor 01') str_method = 'BaggingRegressor01' r = BaggingRegressor( DecisionTreeRegressor(max_depth=6, max_features=0.75)) if (method == 5): print('GradientBoosting 01') str_method = 'GradientBoosting01' r = GradientBoostingRegressor(n_estimators=95,
zeroThreshold=1e-5) else: pipeline.verify(auto_X.sample(frac=0.05, random_state=13)) customize(regressor, **kwargs) store_pkl(pipeline, name + ".pkl") mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"]) store_csv(mpg, name + ".csv") build_auto( AdaBoostRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=17), "AdaBoostAuto") build_auto(ARDRegression(normalize=True), "BayesianARDAuto") build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto") build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2), "DecisionTreeAuto", compact=False) build_auto( BaggingRegressor(DecisionTreeRegressor(random_state=13, min_samples_leaf=5), random_state=13, n_estimators=3, max_features=0.5), "DecisionTreeEnsembleAuto") build_auto(DummyRegressor(strategy="median"), "DummyAuto") build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto") build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5), "ExtraTreesAuto") build_auto(GradientBoostingRegressor(random_state=13, init=None), "GradientBoostingAuto")
def _add_24_to_negatives(data, column):
    """Add 24 to every negative entry of ``data[column]``, element by element."""
    for f in range(len(data[column])):
        if data[column][f] < 0:
            data[column][f] = data[column][f] + 24


def main(train, test):
    """Build features, train LightGBM and XGBoost, stack them with BayesianRidge.

    Steps, in order: drop near-constant columns, filter training rows on
    '收率' and 'B14', concatenate train and test, fill missing values,
    convert time and duration columns with ``timeTranSecond`` /
    ``getDuration``, derive ratio and difference features, label-encode the
    categorical columns, add B14-keyed mean encodings of the binned target,
    one-hot encode, train the two boosted models with K-fold CV and finally
    stack their out-of-fold predictions.

    NOTE: ``test`` is modified in place (four columns are dropped) before it
    is copied by later operations.

    :param train: training frame including the '收率' target column.
    :param test: test frame including the '样本id' column.
    :return: DataFrame with column 'a' (the '样本id' values of ``test``) and
        column 'b' (the stacked predictions).
    """
    clock_cols = ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']
    duration_cols = ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']
    consumed_cols = [
        'A1', 'A3', 'A4', 'A7', 'A5', 'A11', 'A9', 'A14', 'A16', 'A21',
        'A22', 'A20', 'A26', 'A24', 'A28', 'A23', 'B8', 'B6', 'A17'
    ]

    test_ori = test
    test.drop(['B3', 'B13', 'A13', 'A18'], axis=1, inplace=True)

    # Drop columns whose most frequent value covers more than 90% of the
    # rows ('A23' is exempt), then force A1/A3/A4 back in.
    good_cols = list(train.columns)
    for col in train.columns:
        rate = train[col].value_counts(normalize=True, dropna=False).values[0]
        if col not in ['A23']:
            if rate > 0.9:
                good_cols.remove(col)
                print(col, rate)
    good_cols.append('A1')
    good_cols.append('A3')
    good_cols.append('A4')

    # Keep only training rows inside the accepted target / B14 ranges.
    train = train[train['收率'] > 0.87]
    train = train[train['收率'] <= 1]
    train = train[train['B14'] >= 350]
    train = train[train['B14'] <= 460]
    train = train[good_cols]
    good_cols.remove('收率')
    test = test[good_cols]

    target = train['收率']
    del train['收率']
    data = pd.concat([train, test], axis=0, ignore_index=True)
    data.loc[data['A25'] == '1900/3/10 0:00', 'A25'] = 70

    # Missing values: 0 for the time/duration columns, the mode elsewhere.
    # NOTE(review): indexing the result of stats.mode with [0][0] assumes it
    # returns arrays -- confirm against the installed SciPy version.
    for f in data.columns:
        if f != '样本id':
            if f in clock_cols + duration_cols:
                data[f] = data[f].fillna(0)
            else:
                counts = stats.mode(data[f].astype(float))[0][0]
                data[f] = data[f].fillna(counts)

    for f in clock_cols:
        try:
            data[f] = data[f].apply(timeTranSecond)
        except Exception:
            print(f, '应该在前面被删除了!')

    for f in duration_cols:
        data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)
    for f in duration_cols:
        data.loc[data[f] == 0, f] = stats.mode(data[f].astype(float))[0][0]

    data.drop(['样本id'], axis=1, inplace=True)

    # At this point every remaining column is treated as categorical.
    categorical_columns = [f for f in data.columns if f not in ['样本id']]
    numerical_columns = [
        f for f in data.columns if f not in categorical_columns
    ]

    # Builtin float; the np.float alias no longer exists in current NumPy.
    data['A25'] = pd.DataFrame(data['A25'], dtype=float)

    # Ratio features.
    data['b14/a1_a3_a4_a19_b1_b12'] = data['B14'] / (data['A1'] + data['A3'] + data['A4'] + data['A19'] + data['B1'] + data['B12'])
    data['A1_A3_A4/a1_a3_a4_a19_b1_b12'] = (
        data['A1'] + data['A3'] + data['A4']) / (data['A1'] + data['A3'] + data['A4'] + data['A19'] + data['B1'] + data['B12'])
    data['B10_B11'] = (data['B12']) / (data['B10'] + data['B11'])

    # Differences between clock columns; negative results get 24 added.
    data['A11_A5'] = data['A11'] - data['A5']
    _add_24_to_negatives(data, 'A11_A5')
    data['A16_A11'] = data['A16'] - data['A11']
    _add_24_to_negatives(data, 'A16_A11')
    data['A26_A24'] = (data['A26'] - data['A24'])
    _add_24_to_negatives(data, 'A26_A24')
    data['A26_A24_A28'] = data['A26_A24'] / data['A28']
    data['A21_A22_shijian'] = (data['A21'] + data['A22']) / data['A26_A24']
    data['B7_B5'] = (data['B7'] - data['B5'])
    _add_24_to_negatives(data, 'B7_B5')
    data['B14/B7_B5'] = data['B14'] / data['B7_B5']

    numerical_columns.append('b14/a1_a3_a4_a19_b1_b12')
    numerical_columns.append('A21_A22_shijian')

    # The raw inputs of the derived features are dropped.
    for l in consumed_cols:
        data.drop([l], axis=1, inplace=True)

    categorical_columns.append('B14/B7_B5')
    categorical_columns.append('A26_A24_A28')
    categorical_columns.append('A16_A11')
    categorical_columns.append('B10_B11')
    categorical_columns.append('A11_A5')
    for l in consumed_cols:
        categorical_columns.remove(l)

    # Label-encode: each distinct value becomes its order of first appearance.
    for f in categorical_columns:
        data[f] = data[f].map(
            dict(zip(data[f].unique(), range(0, data[f].nunique()))))

    train = data[:train.shape[0]]
    test = data[train.shape[0]:]
    print(train.shape)
    print(test.shape)

    # Bin the target into 5 intervals and one-hot encode the bin id.
    train['target'] = target
    train['intTarget'] = pd.cut(train['target'], 5, labels=False)
    train = pd.get_dummies(train, columns=['intTarget'])
    li = [
        'intTarget_0.0', 'intTarget_1.0', 'intTarget_2.0', 'intTarget_3.0',
        'intTarget_4.0'
    ]

    # Mean of each bin indicator per category value, looked up through B14.
    # A feature is kept only if it has no missing value on the training rows.
    mean_columns = []
    for f1 in categorical_columns:
        cate_rate = train[f1].value_counts(normalize=True, dropna=False).values[0]
        if cate_rate < 0.90:
            for f2 in li:
                col_name = 'B14_to_' + f1 + "_" + f2 + '_mean'
                mean_columns.append(col_name)
                order_label = train.groupby([f1])[f2].mean()
                train[col_name] = train['B14'].map(order_label)
                miss_rate = train[col_name].isnull().sum(
                ) * 100 / train[col_name].shape[0]
                if miss_rate > 0:
                    train = train.drop([col_name], axis=1)
                    mean_columns.remove(col_name)
                else:
                    test[col_name] = test['B14'].map(order_label)

    train.drop(li + ['target'], axis=1, inplace=True)
    print(train.shape)
    print(test.shape)

    X_train = train[mean_columns + numerical_columns].values
    X_test = test[mean_columns + numerical_columns].values

    # One-hot encode every categorical column and append it as sparse columns.
    enc = OneHotEncoder()
    for f in categorical_columns:
        enc.fit(data[f].values.reshape(-1, 1))
        X_train = sparse.hstack(
            (X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
        X_test = sparse.hstack(
            (X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')
    print(X_train.shape)
    print(X_test.shape)

    y_train = target.values

    # ---- LightGBM, 8-fold ----
    param = {
        'num_leaves': 120,
        'min_data_in_leaf': 30,
        'objective': 'regression',
        'max_depth': -1,
        'learning_rate': 0.05,
        "min_child_samples": 30,
        "boosting": "gbdt",
        "feature_fraction": 0.9,
        "bagging_freq": 1,
        "bagging_fraction": 0.9,
        "bagging_seed": 11,
        "metric": 'mse',
        "lambda_l1": 0.1,
        "verbosity": -1
    }
    folds = KFold(n_splits=8, shuffle=True, random_state=2018)
    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
        val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
        num_round = 10000
        clf = lgb.train(param,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=200,
                        early_stopping_rounds=100)
        oof_lgb[val_idx] = clf.predict(X_train[val_idx],
                                       num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(
            X_test, num_iteration=clf.best_iteration) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target)))

    # ---- XGBoost, 5-fold ----
    xgb_params = {
        'eta': 0.005,
        'max_depth': 10,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': True,
        'nthread': 4
    }
    folds = KFold(n_splits=5, shuffle=True, random_state=2018)
    oof_xgb = np.zeros(len(train))
    predictions_xgb = np.zeros(len(test))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
        val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
        clf = xgb.train(dtrain=trn_data,
                        num_boost_round=20000,
                        evals=watchlist,
                        early_stopping_rounds=200,
                        verbose_eval=100,
                        params=xgb_params)
        oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]),
                                       ntree_limit=clf.best_ntree_limit)
        predictions_xgb += clf.predict(
            xgb.DMatrix(X_test),
            ntree_limit=clf.best_ntree_limit) / folds.n_splits
    print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))

    # ---- stacking: BayesianRidge on the two out-of-fold columns ----
    train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()
    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values
        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)
        oof_stack[val_idx] = clf_3.predict(val_data)
        # 10 = n_splits (5) * n_repeats (2) of folds_stack.
        predictions += clf_3.predict(test_stack) / 10
    print("CV score: {:<8.8f}".format(
        mean_squared_error(target.values, oof_stack)))

    sub_df = pd.DataFrame({'a': test_ori['样本id'], 'b': predictions})
    return sub_df
def stack_model(self, prediction_list_name, method = "BayesianRidge", split_method = "kFold", n_splits = 5, random_state = 4520):
    """Stack saved model predictions with a second-level regressor.

    For each file name in ``prediction_list_name`` the test predictions are
    read from ``<submission_dir>/<name>`` and the out-of-fold predictions
    from ``<submission_dir>/oof/oof_<name>`` (column ``target`` of each).
    Names for which either file is missing are reported and skipped. The
    stacked test predictions and the stacked out-of-fold predictions are
    written back as CSV files under the same two directories.

    :param prediction_list_name: list of prediction file names.
    :param method: second-level model; only ``'BayesianRidge'`` is supported.
    :param split_method: ``'kFold'`` or ``'StratifiedKFold'``.
    :param n_splits: number of folds. The test prediction is the mean over
        these folds.
    :param random_state: accepted for backward compatibility. The splitters
        are built without shuffling, where a seed has no effect.
    :raises ValueError: for an unsupported ``method`` or ``split_method``.
    """
    target, test_df = self.get_train_target()
    if len(prediction_list_name) == 0:
        print("no prediction result ...")
        return
    if method != 'BayesianRidge':
        raise ValueError("unsupported stacking method: {}".format(method))

    oof_list = []
    prediction_list = []
    for name in prediction_list_name:
        pred_path = os.path.join(self.submission_dir, name)
        oof_path = os.path.join(self.submission_dir+'/oof', 'oof_'+name)
        if not os.path.isfile(pred_path):
            print("{} is not a prediction result path".format(pred_path))
        elif not os.path.isfile(oof_path):
            print("{} is not a oof result path".format(oof_path))
        else:
            oof = pd.read_csv(oof_path)
            prediction = pd.read_csv(pred_path)
            prediction_list.append(prediction['target'].values)
            oof_list.append(oof['target'].values)

    if not oof_list:
        # Every name was skipped: nothing to stack, same outcome as an empty
        # input list.
        print("no prediction result ...")
        return

    # One column per first-level model.
    train_stack = np.vstack(oof_list).transpose()
    test_stack = np.vstack(prediction_list).transpose()

    # No random_state here: without shuffle it has no effect, and recent
    # scikit-learn releases reject the combination.
    if split_method == 'kFold':
        kfold = KFold(n_splits=n_splits)
    elif split_method == 'StratifiedKFold':
        kfold = StratifiedKFold(n_splits=n_splits)
    else:
        raise ValueError("unsupported split method: {}".format(split_method))

    oof_stack = np.zeros(train_stack.shape[0])
    predictions_stack = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx, val_idx) in enumerate(kfold.split(train_stack, target)):
        print("fold n°{}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

        print("-" * 10 + "Stacking " + str(fold_) + "-" * 10)
        clf = BayesianRidge()
        clf.fit(trn_data, trn_y)

        oof_stack[val_idx] = clf.predict(val_data)
        # Average over the actual number of folds (was a hard-coded 5).
        predictions_stack += clf.predict(test_stack) / n_splits

    print("cv score : ",np.sqrt(mean_squared_error(target.values, oof_stack)))

    print('save stacked oof file and prediction file ...')
    oof_file_name = '_'.join(prediction_list_name).strip()
    oof_file_name = 'oof_merge_'+oof_file_name

    pred_file_name = '_'.join(prediction_list_name).strip()
    pred_file_name = 'merge_'+pred_file_name

    stack_result = pd.DataFrame({'card_id':test_df['card_id']})
    stack_result['target'] = predictions_stack
    stack_result.to_csv(os.path.join(self.submission_dir,pred_file_name), index=False)

    oof_stack = pd.DataFrame({'target': oof_stack})
    oof_stack.to_csv(os.path.join(self.submission_dir + '/oof', 'oof_' + oof_file_name), index=False)
    print('stacked oof and prediction file save successfully ...')
def test3():
    """Train a BayesianRidge model on an uploaded CSV and report its metrics.

    Reads ``name``, ``target`` and ``test_size`` from the request form and the
    CSV from the ``dataset`` upload. Creates ``<assets>/<name>/plots`` and
    ``<assets>/<name>/models``, saves three diagnostic plots and the pickled
    model there, and returns a JSON payload with the metrics and the relative
    paths of those artefacts.

    Returns:
        ``(response, 201)`` on success, ``(response, 400)`` when ``name`` is
        not a plain directory name.
    """
    name = request.form["name"]
    target = request.form["target"]
    test_size = request.form["test_size"]
    dataset = request.files["dataset"]

    # ``name`` comes straight from the client and is used as a directory
    # name; refuse anything that could point outside the assets directory.
    if not name or name in (".", "..") or os.path.basename(name) != name:
        return jsonify({"status": "error", "message": "invalid name"}), 400

    df = pd.read_csv(dataset)

    # Directory layout: <assets>/<name>/{plots,models}
    parent_dir = "/home/sanfer/Documents/ml-examples-vuejs-flask/web-app/src/assets/"
    project_dir = os.path.join(parent_dir, name)
    os.mkdir(project_dir)
    plots_dir = os.path.join(project_dir, "plots")
    os.mkdir(plots_dir)
    model_dir = os.path.join(project_dir, "models")
    os.mkdir(model_dir)

    # Pre-processing plot: distribution of the target column.
    dist_fig = sns.distplot(df[target]).get_figure()
    dist_fig.savefig(plots_dir + "/dist.png")
    dist_fig.clf()

    # Keep only float64/int64 columns and remember their dtype names.
    features = {}
    for column, dtype in df.dtypes.items():
        if dtype.name in ('float64', 'int64'):
            features[column] = dtype.name
        else:
            df.drop(labels=column, axis=1, inplace=True)
    del features[target]
    features = json.dumps(features)

    y = df[target]
    df.drop(labels=target, axis=1, inplace=True)
    # NOTE(review): the original statement here,
    #   df.replace(0, np.NaN).fillna(df.mean(), inplace=True)
    # filled a temporary copy and never changed ``df``. It is dropped rather
    # than "fixed": enabling it would replace every 0 with the column mean
    # and change all fitted models. Confirm the intended imputation.
    X = df[list(df.columns)]

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(test_size), random_state=101)

    from sklearn.linear_model import BayesianRidge
    lm = BayesianRidge()
    lm.fit(X_train, y_train)
    print("Linear model intercept")
    print(lm.intercept_)
    coeff_df = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
    print(coeff_df)

    predictions = lm.predict(X_test)

    plt.scatter(y_test, predictions)
    plt.savefig(plots_dir + "/scatter.png")
    plt.clf()

    residual_fig = sns.distplot((y_test - predictions), bins=50).get_figure()
    residual_fig.savefig(plots_dir + "/residual.png")
    residual_fig.clf()

    from sklearn import metrics
    MAE = metrics.mean_absolute_error(y_test, predictions)
    MSE = metrics.mean_squared_error(y_test, predictions)
    RMSE = np.sqrt(MSE)
    print('MAE:', MAE)
    print('MSE:', MSE)
    print('RMSE:', RMSE)
    print(test_size)
    print(name)
    print(features)

    pkl_filename = model_dir + "/" + name + ".pkl"
    with open(pkl_filename, 'wb') as file:
        pickle.dump(lm, file)

    # R^2 is scored on the full data set (train and test rows together).
    r_square = lm.score(X, y)

    # Paths returned to the client are relative to the assets directory.
    scatterplotpath = name + "/plots/scatter.png"
    distpath = name + "/plots/dist.png"
    residualpath = name + "/plots/residual.png"
    modelpath = name + "/models/" + name + ".pkl"

    return jsonify({
        "status": "success LinearReg",
        "metrics": {
            "mae": MAE,
            "mse": MSE,
            "rmse": RMSE,
            "r_square": r_square
        },
        "ploturl": {
            "scatterplotpath": scatterplotpath,
            "distpath": distpath,
            "residualpath": residualpath
        },
        "feature_names": features,
        "model_path": modelpath
    }), 201
y_train = y_train[['Target']] print(y_train) y_test = y_test[['Target']] # corr = data.corr() # param_grid = {'C': [4.7, 4.8, 4.9, 5.0], 'gamma': [ 0.000009, 0.000010, 0.000011, 0.000012]} print(X_train) print(y_train) # regressor = LinearRegression() # regressor = SVR(C=5, gamma=0.00001) regressor = BayesianRidge(normalize=True, n_iter=5, tol=0.01, fit_intercept=True) # regressor = ARDRegression(normalize=True, n_iter=5, tol=0.01) # regressor = SGDRegressor() # regressor = MLPRegressor(hidden_layer_sizes=(200, 50, 10)) # regressor = RANSACRegressor(min_samples=80, max_trials=1000) # regressor = Lasso() regressor.fit(X_train, y_train.squeeze().tolist()) print(regressor.score(X_train, y_train.squeeze().tolist())) print(regressor.score(X_test, y_test.squeeze().tolist())) print(regressor.get_params()) y_predict = regressor.predict(X_test) print(y_predict)
trainingcounts = counts[100:] testcounts = counts[:100] trainingrates = countrates[100:] testrates = countrates[:100] trainingtimes = times[100:] testtimes = times[:100] # using trainingcounts and training hists use log linear #poisson_model = sm.GLM(trainingrates, # sm.tools.tools.add_constant(traininghists), # family =sm.families.Poisson(sm.genmod.families.links.log)) #results = poisson_model.fit() #print(results.summary()) #x = results.predict(sm.tools.tools.add_constant(testhists)) clf = BayesianRidge(compute_score=True) clf.fit(traininghists,trainingrates) x = clf.predict(testhists) answer = testrates plt.plot(bins,x) plt.plot(bins,answer) plt.show()
# # LinearRegression # Ridge # Lasso # Random Forrest # Gradient Boosting Tree # Support Vector Regression # Linear Support Vector Regression # ElasticNet # Stochastic Gradient Descent # BayesianRidge # KernelRidge # ExtraTreesRegressor # XgBoost models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(), ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(max_iter=1000,tol=1e-3),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5), ExtraTreesRegressor(),XGBRegressor()] names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"] # for name, model in zip(names, models): # score = rmse_cv(model, X_scaled, y_log) # print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std())) # grid函数寻找最优参数 class grid(): def __init__(self, model): self.model = model def grid_get(self, X, y, param_grid): grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring="neg_mean_squared_error") grid_search.fit(X, y)
return X_train, X_test, y, y_test, snr, noise, w, size ############################################################################### # Create data size = 12 n_samples = 400 X_train, X_test, y_train, y_test, snr, noise, coefs, size =\ create_simulation_data(snr=10, n_samples=n_samples, size=size) ############################################################################### # Compute the results for supervised clustering A = grid_to_graph(n_x=size, n_y=size, n_z=size) clf = BayesianRidge(fit_intercept=True, normalize=True, tol=1.e-3) sc = supervised_clustering.SupervisedClusteringRegressor(estimator=clf, connectivity=A, n_iterations=30, cv=25, verbose=1, n_jobs=8) #sc = supervised_clustering.SupervisedClusteringRegressor(clf, connectivity=A, # n_iterations=30, verbose=1, n_jobs=8, # cv=ShuffleSplit(X_train.shape[0], n_splits=10, test_fraction=0.6, # random_state=0)) t1 = time() sc.fit(X_train, y_train) sc_time = time() -t1 computed_coefs = sc.inverse_transform() computed_coefs = np.reshape(computed_coefs, [size, size, size]) score = sc.score(X_test, y_test) ###############################################################################
# total1.shape # 最后,所有数据预处理完毕,后进行数据train/test进行分离。 train = total1[total1['source'] == 'train'] test = total1[total1['source'] == 'test'] train.drop(['source'], axis=1, inplace=True) test.drop(['source'], axis=1, inplace=True) # train.shape, test.shape # 模型预测 # 首先选择一些基本模型进行单模型数据仿真。选择比较有代表性的线性模型、随机森林、GDBR以及最近比较流行的XGBoost。 # 模型评估函数选择Kaggle官网指定的评估指标,均方根误差。 lass = Lasso(alpha=0.1) bayes = BayesianRidge(n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06) regr = RandomForestRegressor(max_depth=2) gbr = GradientBoostingRegressor( loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_depth=3, alpha=0.9, ) model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,
frames = [train, test] df = pd.concat(frames, axis=0, ignore_index=True) ### Imputing DYAR train = df[(df.DYAR.isnull() ==False) & (df.pct_team_tgts.isnull() == False)] train.reset_index(inplace=True, drop=True) test = df[(df.DYAR.isnull() == True) & (df.pct_team_tgts.isnull() == False)] test.reset_index(inplace= True, drop=True) features = ['targets', 'receptions', 'rec_tds', 'start_ratio', 'pct_team_tgts', 'pct_team_receptions', 'pct_team_touchdowns', 'rec_yards', 'dpi_yards', 'fumbles', 'first_down_ctchs', 'pct_of_team_passyards'] X = scale(train[features]) y = train.DYAR # Our best model for predicting DYAR was a Bayesian Ridge Regressor br = BayesianRidge() br.fit(X,y) dyar_predictions = pd.DataFrame(br.predict(scale(test[features])), columns = ['DYAR_predicts']) test = test.join(dyar_predictions) test['DYAR'] = test['DYAR_predicts'] test.drop('DYAR_predicts', inplace=True, axis=1) frames = [train,test] df = pd.concat(frames, axis=0, ignore_index=True) ### Imputing EYds train = df[(df.EYds.isnull() ==False) & (df.pct_team_tgts.isnull() == False)] train.reset_index(inplace=True, drop=True) test = df[(df.EYds.isnull() == True) & (df.pct_team_tgts.isnull() == False)] test.reset_index(inplace= True, drop=True)
# Create weights with a precision lambda_ of 4. lambda_ = 4. w = np.zeros(n_features) # Only keep 10 weights of interest relevant_features = np.random.randint(0, n_features, 10) for i in relevant_features: w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_)) # Create noise with a precision alpha of 50. alpha_ = 50. noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples) # Create the target y = np.dot(X, w) + noise # ############################################################################# # Fit the Bayesian Ridge Regression and an OLS for comparison clf = BayesianRidge(compute_score=True) clf.fit(X, y) ols = LinearRegression() ols.fit(X, y) # ############################################################################# # Plot true weights, estimated weights, histogram of the weights, and # predictions with standard deviations lw = 2 plt.figure(figsize=(6, 5)) plt.title("Weights of the model") plt.plot(clf.coef_, color='lightgreen', linewidth=lw, label="Bayesian Ridge estimate") plt.plot(w, color='gold', linewidth=lw, label="Ground truth") plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
def _fit_and_auc(model, X_train, label, X_test, Y, use_proba=False):
    """Fit ``model`` on the training split and return the ROC AUC of its test scores."""
    model.fit(X_train, label)
    if use_proba:
        # Score with the second predict_proba column.
        Y_hat = [row[1] for row in model.predict_proba(X_test)]
    else:
        Y_hat = model.predict(X_test)
    fpr, tpr, _ = metrics.roc_curve(Y, Y_hat)
    return metrics.auc(fpr, tpr)


def do_validation(data_path, steps=10):
    """Print the mean hold-out ROC AUC of several models over ``steps`` drivers.

    For each of the first ``steps`` entries returned by ``initialize(data_path)``
    the merged data set is shuffled, rows 0-99 are used for training and rows
    100-399 for testing. Every model is refitted on each driver and its AUC
    is accumulated; the sums divided by ``steps`` are printed at the end.

    Args:
        data_path: passed to ``initialize``.
        steps: number of drivers (entries of the initialised list) to use.
    """
    allfiles = initialize(data_path)

    # (display name, estimator, use predict_proba) - fitted in this order.
    all_feature_models = [
        ("GBM", GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6,
                                          min_samples_leaf=5, subsample=0.5), False),
        ("AdaBoost", AdaBoostRegressor(n_estimators=200, learning_rate=1), False),
        ("Extra Trees", ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5), False),
        ("RF", RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5), False),
        ("KN", KNeighborsRegressor(n_neighbors=25), False),
    ]

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    # NOTE(review): ``probability`` on SVR and ``n_iter`` on BayesianRidge are
    # kept as written; both may be rejected by newer scikit-learn - confirm.
    reduced_feature_models = [
        ("Logit", LogisticRegression(tol=0.05), True),
        ("SVR", SVR(kernel="linear", probability=True), False),
        ("Ridge", Ridge(alpha=18), False),
        ("BayesianRidge", BayesianRidge(n_iter=500), False),
        ("Elastic Net", ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05), False),
        ("Neural Networks", Pipeline(steps=[('rbm', rbm), ('logistic', logistic)]), False),
    ]

    # Columns removed before fitting each group of models.
    base_drop = ['driver', 'trip']
    reduced_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle',
                    'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
                    'sd_avg_speed', 'mean_inst_speed', 'points']
    groups = ((base_drop, all_feature_models), (reduced_drop, reduced_feature_models))

    totals = {}
    for _, models in groups:
        for model_name, _, _ in models:
            totals[model_name] = 0.0

    for i in range(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y

        # Shuffle, then split by position.
        df = df.reindex(np.random.permutation(df.index))
        train = df[:100]
        test = df[100:400]
        label = train['label']
        Y = test['label']
        train = train.drop('label', axis=1)
        test = test.drop('label', axis=1)

        for to_drop, models in groups:
            X_train = train.drop(to_drop, axis=1)
            X_test = test.drop(to_drop, axis=1)
            for model_name, model, use_proba in models:
                totals[model_name] += _fit_and_auc(
                    model, X_train, label, X_test, Y, use_proba=use_proba)

    # One blank line before, between and after the two groups.
    print("")
    for _, models in groups:
        for model_name, _, _ in models:
            print("%s: %s" % (model_name, totals[model_name] / steps))
        print("")
X = np.random.randn(n_samples, size**2) for x in X: # smooth data x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel() X -= X.mean(axis=0) X /= X.std(axis=0) y = np.dot(X, coef.ravel()) noise = np.random.randn(y.shape[0]) noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2) y += noise_coef * noise # add noise ############################################################################### # Compute the coefs of a Bayesian Ridge with GridSearch cv = KFold(len(y), 2) # cross-validation generator for model selection ridge = BayesianRidge() mem = Memory(cachedir='.', verbose=1) # Ward agglomeration followed by BayesianRidge A = grid_to_graph(n_x=size, n_y=size) ward = WardAgglomeration(n_clusters=10, connectivity=A, memory=mem, n_components=1) clf = Pipeline([('ward', ward), ('ridge', ridge)]) # Select the optimal number of parcels with grid search clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv) clf.fit(X, y) # set the best parameters coef_ = clf.best_estimator.steps[-1][1].coef_ coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_) coef_agglomeration_ = coef_.reshape(size, size)
def sale(data):
    """Return log(int(data) + 1), the transform applied to the sales target."""
    data = int(data) + 1
    return log(data)


dataset = pandas.read_csv("input/train2_.csv")
testset = pandas.read_csv("input/test2_.csv")

# Train on the log-transformed target.
dataset['Sale'] = dataset['Sales'].apply(sale)
labelData = dataset['Sale'].values

# Keep the ids for the submission file, then drop them from the features.
myId = testset['Id'].values
testset.drop(['Id'], inplace=True, axis=1)
testData = testset.iloc[:, :].values

dataset.drop(['Sales', 'Sale'], inplace=True, axis=1)
dataData = dataset.iloc[:, :].values

BRModel = BayesianRidge(compute_score=True)
BRModel.fit(dataData, labelData)

# Undo the log(x + 1) transform on the predictions.
preds = [[int(row_id), exp(float(pred)) - 1]
         for row_id, pred in zip(myId, BRModel.predict(testData))]

# print(...) with one argument behaves the same on Python 2 and 3.
print(BRModel.scores_)

with open("result/sub_BayesRidge.csv", "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(["Id", "Sales"])
    writer.writerows(preds)
def calculate_score(deg, X, y):
    """Fit a scaled polynomial BayesianRidge pipeline and return its score on the same data.

    Args:
        deg: degree passed to ``PolynomialFeatures``.
        X: feature matrix used for both fitting and scoring.
        y: target values used for both fitting and scoring.

    Returns:
        ``pipe.score(X, y)`` of the fitted pipeline (a training-set score).
    """
    # ``normalize=False`` was the default and the keyword has since been
    # removed from scikit-learn, so it is simply not passed; scaling is done
    # by the StandardScaler step.
    pipe = make_pipeline(StandardScaler(), PolynomialFeatures(deg), BayesianRidge())  # type: Pipeline
    pipe.fit(X, y)
    return pipe.score(X, y)
ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_] if imputation_order == 'roman': assert np.all(ordered_idx[:d - 1] == np.arange(1, d)) elif imputation_order == 'arabic': assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1)) elif imputation_order == 'random': ordered_idx_round_1 = ordered_idx[:d - 1] ordered_idx_round_2 = ordered_idx[d - 1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: assert len(ordered_idx) == 2 * (d - 1) @pytest.mark.parametrize( "predictor", [DummyRegressor(), BayesianRidge(), ARDRegression()]) def test_chained_imputer_predictors(predictor): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() imputer = ChainedImputer(missing_values=0, n_imputations=1, n_burn_in=1, predictor=predictor, random_state=rng) imputer.fit_transform(X)
from preprocess import preprocess
from kaggle_util import make_submission_with_model, get_kaggle_scores, submit_to_kaggle


def house_prices_preprocess(root):
    """Call ``preprocess`` with the settings for the House Prices data under ``root``."""

    def manual_processing(features, complete_features):
        # No competition-specific processing: features are returned untouched.
        return features

    target_column = 'SalePrice'
    preprocess(
        root,
        target_column,
        ['Id'],             # columns_to_drop
        [],                 # forced_categorical
        [],                 # forced_numeric
        [target_column],    # columns_to_normalize
        [],                 # use_labeler
        manual_processing,
    )


if __name__ == '__main__':
    competition = 'house-prices-advanced-regression-techniques'
    root = f'C:/data/{competition}/'
    print(get_kaggle_scores(competition))

    from sklearn.linear_model import BayesianRidge

    path = make_submission_with_model(BayesianRidge(), root)
    #submit_to_kaggle(path, competition)
def Ridge_Regression():
    """Return an unfitted ``BayesianRidge`` created with ``compute_score=True``.

    Despite the function name, the estimator is Bayesian ridge regression,
    not plain ``Ridge``.
    """
    return BayesianRidge(compute_score=True)
# 将lgb和xgb的结果进行stacking train_stack = np.vstack([oof_lgb, oof_xgb]).transpose() test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose() folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590) oof_stack = np.zeros(train_stack.shape[0]) predictions = np.zeros(test_stack.shape[0]) for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)): print("fold {}".format(fold_)) trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values clf_3 = BayesianRidge() clf_3.fit(trn_data, trn_y) oof_stack[val_idx] = clf_3.predict(val_data) predictions += clf_3.predict(test_stack) / 10 mean_squared_error(target.values, oof_stack) sub_df = pd.read_csv('datalab/7955/jinnan_round1_submit_20181227.csv', header=None) sub_df[1] = predictions sub_df[1] = sub_df[1].apply(lambda x: round(x, 3)) def modeling_cross_validation(params, X, y, nr_folds=5): oof_preds = np.zeros(X.shape[0])
def _bayesian_ridge(n_iter, **params):
    """Build a BayesianRidge capped at ``n_iter`` iterations on old and new scikit-learn."""
    try:
        # Newer releases call the iteration cap ``max_iter``.
        return BayesianRidge(max_iter=n_iter, **params)
    except TypeError:
        # Older releases only accept the original ``n_iter`` keyword.
        return BayesianRidge(n_iter=n_iter, **params)


def model_BayesianRidge(train_X, test_X):
    """Fit two BayesianRidge models and return their predictions side by side.

    Both frames must contain a ``class`` column; it is used as the label for
    ``train_X`` and removed from the features of both frames. The single-model
    mean squared error and R^2 on ``test_X`` are printed for each model.

    Args:
        train_X: training DataFrame including the ``class`` label column.
        test_X: test DataFrame including the ``class`` label column.

    Returns:
        ``(test, train)``: two DataFrames, each holding the predictions of
        model 1 and model 2 as two columns.
    """
    # The label is never fed to the models as a feature.
    train_y = train_X['class'].values
    train_x = train_X.drop(['class'], axis=1).values
    test_x = test_X.drop(['class'], axis=1).values

    # ``normalize=False`` was the default and the keyword no longer exists in
    # current scikit-learn, so it is not passed.
    # Model 1: default priors, 300 iterations.
    clf1 = _bayesian_ridge(300, alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                           copy_X=True, fit_intercept=True, lambda_1=1e-06,
                           lambda_2=1e-06, tol=0.001, verbose=False)
    clf1 = clf1.fit(train_x, train_y)
    test1 = pd.DataFrame(clf1.predict(test_x))
    train1 = pd.DataFrame(clf1.predict(train_x))

    # Model 2: 1e-05 priors, 400 iterations.
    clf2 = _bayesian_ridge(400, alpha_1=1e-05, alpha_2=1e-05, compute_score=False,
                           copy_X=True, fit_intercept=True, lambda_1=1e-05,
                           lambda_2=1e-05, tol=0.001, verbose=False)
    clf2 = clf2.fit(train_x, train_y)
    test2 = pd.DataFrame(clf2.predict(test_x))
    train2 = pd.DataFrame(clf2.predict(train_x))

    # Put the two models' predictions next to each other.
    test = pd.concat([test1, test2], axis=1)
    train = pd.concat([train1, train2], axis=1)

    print('------贝叶斯回归单模型结果-------')
    print("Mean squared error: %.2f" % mean_squared_error(test_X['class'], test1))
    print('Variance score: %.2f' % r2_score(test_X['class'], test1))
    print("Mean squared error: %.2f" % mean_squared_error(test_X['class'], test2))
    print('Variance score: %.2f' % r2_score(test_X['class'], test2))
    print('\n')
    return test, train
for index, row in tt.iterrows(): if pd.isnull(row['Age']): for key in avg_age.keys(): if key in row['Name']: tt.loc[index,"Age"] = avg_age[key] #-------------------------------------------------------------------------------- # X = td.loc[:,['Sex','Age', 'Fare','SibSp','Parch','Pclass']].values X = np.where(np.isnan(X), -1, X) X_ = tt.loc[:,['Sex','Age', 'Fare','SibSp','Parch', 'Pclass']].values X_ = np.where(np.isnan(X_), -1, X_) Y = td['Survived'].values clf = BayesianRidge(lambda_1=10**-4, lambda_2=10**-4, alpha_1=10**2.75,alpha_2=10**3.3, compute_score=True) #0.78947 model = clf.fit(X, Y) #Result predict_result = model.predict(X_).round(0).astype(int) result = pd.DataFrame.from_items([('PassengerId',tt['PassengerId']), ('Survived',predict_result)]) result.to_csv('result/bayes_result.csv', index=False) t1 = pd.read_csv("result/elastic_result_077512.csv") t2 = pd.read_csv("result/bayes_result.csv") t3 = t1 == t2 i = 0 for index, row in t3.iterrows(): if row['Survived'] == False: i += 1
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签 plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号 # 对稀疏数据做标准化,不能采用中心化的方式,否则会破坏稀疏数据的结构.比如此文件的boston数据集 size = 500 # train size df = pd.DataFrame(pd.read_excel('boston_data.xlsx', header=0)) # np.random.shuffle() training_data_input = df.values[:, 0:13][:size] # 500*13 training_data_output = df.values[:, 13:14][:size].ravel() # .dtype打印出数据类型,.ravel() 返回一维的数组 test_data_input = df.values[:, 0:13][size:] test_data_output = df.values[:, 13:14][size:].ravel() # float64 n_folds = 6 # 设置交叉检验的次数 model_br = BayesianRidge() # 建立贝叶斯岭回归模型对象 model_lr = LinearRegression() # 建立普通线性回归模型对象 model_etc = ElasticNet() # 建立弹性网络回归模型对象 model_svr = SVR() # 建立支持向量机回归模型对象 model_gbr = GradientBoostingRegressor() # 建立梯度增强算法回归模型对象 model_mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1) model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR', 'MLP'] # 不同模型的名称列表 model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr, model_mlp] # 不同回归模型对象的集合 cv_score_list = [] # 交叉检验结果列表 pre_y_list = [] # 各个回归模型预测的y值列表 for model in model_dic: # 读出每个回归模型对象 # 将每个回归模型导入交叉检验模型中做训练检验 scores = cross_val_score(model, training_data_input, training_data_output, cv=n_folds) # 模型与数据间的距离,并非越大越好 cv_score_list.append(scores) pre_y_list.append(
def get_algorithm(self):
    '''
    Build the regressor named by ``self.algorithmName``.

    The name follows scikit-learn class naming. Supported names:
        ARDRegression, AdaBoostRegressor, BaggingRegressor, BayesianRidge, CCA,
        DecisionTreeRegressor, ElasticNet, ExtraTreeRegressor,
        ExtraTreesRegressor, GaussianProcessRegressor,
        GradientBoostingRegressor, HuberRegressor, KNeighborsRegressor,
        KernelRidge, Lars, Lasso, LassoLars, LinearRegression, LinearSVR,
        MLPRegressor, NuSVR, OrthogonalMatchingPursuit, PLSCanonical,
        PLSRegression, PassiveAggressiveRegressor, RANSACRegressor,
        RandomForestRegressor, Ridge, SGDRegressor, SVR, TheilSenRegressor,
        TransformedTargetRegressor

    Returns:
        A new estimator built with its default arguments, or None when the
        name is not in the list above.
    '''
    # Each entry is a zero-argument factory, so an estimator class is only
    # looked up and instantiated when its name is actually requested.
    factories = {
        "ARDRegression": lambda: ARDRegression(),
        "AdaBoostRegressor": lambda: AdaBoostRegressor(),
        "BaggingRegressor": lambda: BaggingRegressor(),
        "BayesianRidge": lambda: BayesianRidge(),
        "CCA": lambda: CCA(),
        "DecisionTreeRegressor": lambda: DecisionTreeRegressor(),
        "ElasticNet": lambda: ElasticNet(),
        "ExtraTreeRegressor": lambda: ExtraTreeRegressor(),
        "ExtraTreesRegressor": lambda: ExtraTreesRegressor(),
        "GaussianProcessRegressor": lambda: GaussianProcessRegressor(),
        "GradientBoostingRegressor": lambda: GradientBoostingRegressor(),
        "HuberRegressor": lambda: HuberRegressor(),
        "KNeighborsRegressor": lambda: KNeighborsRegressor(),
        "KernelRidge": lambda: KernelRidge(),
        "Lars": lambda: Lars(),
        "Lasso": lambda: Lasso(),
        "LassoLars": lambda: LassoLars(),
        "LinearRegression": lambda: LinearRegression(),
        "LinearSVR": lambda: LinearSVR(),
        "MLPRegressor": lambda: MLPRegressor(),
        "NuSVR": lambda: NuSVR(),
        "OrthogonalMatchingPursuit": lambda: OrthogonalMatchingPursuit(),
        "PLSCanonical": lambda: PLSCanonical(),
        "PLSRegression": lambda: PLSRegression(),
        "PassiveAggressiveRegressor": lambda: PassiveAggressiveRegressor(),
        "RANSACRegressor": lambda: RANSACRegressor(),
        "RandomForestRegressor": lambda: RandomForestRegressor(),
        "Ridge": lambda: Ridge(),
        "SGDRegressor": lambda: SGDRegressor(),
        "SVR": lambda: SVR(),
        "TheilSenRegressor": lambda: TheilSenRegressor(),
        "TransformedTargetRegressor": lambda: TransformedTargetRegressor(),
    }
    factory = factories.get(self.algorithmName)
    if factory is None:
        return None
    return factory()
if imputation_order == 'roman': assert np.all(ordered_idx[:d - 1] == np.arange(1, d)) elif imputation_order == 'arabic': assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1)) elif imputation_order == 'random': ordered_idx_round_1 = ordered_idx[:d - 1] ordered_idx_round_2 = ordered_idx[d - 1:] assert ordered_idx_round_1 != ordered_idx_round_2 elif 'ending' in imputation_order: assert len(ordered_idx) == max_iter * (d - 1) @pytest.mark.parametrize( "estimator", [None, DummyRegressor(), BayesianRidge(), ARDRegression(), RidgeCV()]) def test_iterative_imputer_estimators(estimator): rng = np.random.RandomState(0) n = 100 d = 10 X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray() imputer = IterativeImputer(missing_values=0, max_iter=1, estimator=estimator, random_state=rng) imputer.fit_transform(X)
# separate the data into training and testing if TIME_SERIES: test_idx = X.index.values[-int(X.shape[0] / 5):] else: np.random.seed(1) test_idx = np.random.choice(a=X.index.values, size=int(X.shape[0] / 5), replace=False) train_idx = np.array(list(set(X.index.values) - set(test_idx))) # set up the model if classifier: model = MultiOutputClassifier(GaussianNB()) else: model = MultiOutputRegressor(BayesianRidge(n_iter=300)) # train the model model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :]) # In[2]: Collect the predictions # predict training and testing data train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]), columns=Y.columns) test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]), columns=Y.columns) # reshape all of the predictions into a single table predictions = pd.DataFrame() for j in range(outputs):
def main():
    """Measure pairwise interactions between filter consensus motifs.

    Places every ordered pair of the first 20 filter consensus motifs around
    the centre of a uniform-background sequence, scores the sequences with the
    Torch model, fits an additive BayesianRidge model (one indicator per motif
    and position) per target, and writes the coefficients, a table of residuals
    and a residual heat map to the output directory.
    """
    usage = "usage: %prog [options] <model_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_dist",
        default=10,
        type="int",
        help="Distance between the motifs and sequence center [Default: %default]",
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]"
    )
    parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file")  # exits
    model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(",")]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = "-cuda" if options.cuda else ""

    #################################################################
    # place filter consensus motifs
    #################################################################
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    # Only the first 20 filters are paired.
    num_filters = 20
    filter_len = filter_consensus[0].shape[1]

    # Floor division keeps the slice bounds integral on Python 3 as well.
    center = options.seq_length // 2
    left_i = center - options.center_dist - filter_len
    right_i = center + options.center_dist

    # Uniform background: every nucleotide has weight 0.25 at every position.
    ns_1hot = np.zeros((4, options.seq_length)) + 0.25

    seqs_1hot = []
    for i in range(num_filters):
        for j in range(num_filters):
            motifs_seq = np.copy(ns_1hot)
            motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i]
            motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j]
            seqs_1hot.append(motifs_seq)

    # make a full array and reshape for spatial
    seqs_1hot = np.array(seqs_1hot)
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length))

    #################################################################
    # score the sequences
    #################################################################
    seqs_file = "%s/motif_seqs.h5" % options.out_dir
    with h5py.File(seqs_file, "w") as h5f:
        h5f.create_dataset("test_in", data=seqs_1hot)

    scores_file = "%s/motif_seqs_scores.h5" % options.out_dir
    # Argument list without a shell: paths with spaces or shell
    # metacharacters are passed through verbatim.
    torch_cmd = ["th", "basset_place2_predict.lua"]
    if cuda_str:
        torch_cmd.append(cuda_str)
    torch_cmd += [model_file, seqs_file, scores_file]
    subprocess.call(torch_cmd)

    with h5py.File(scores_file, "r") as hdf5_in:
        motif_seq_scores = np.array(hdf5_in["scores"])

    # "-t -1" is documented as "all targets".
    if out_targets == [-1]:
        out_targets = list(range(motif_seq_scores.shape[1]))

    #################################################################
    # analyze
    #################################################################
    # Additive design matrix: one indicator for the left motif and one for the
    # right motif of every sequence. It is the same for all targets.
    X = np.zeros((motif_seq_scores.shape[0], 2 * num_filters))
    xi = 0
    for i in range(num_filters):
        for j in range(num_filters):
            X[xi, i] += 1
            X[xi, num_filters + j] += 1
            xi += 1

    for ti in out_targets:
        # fit the additive model and predict the pairwise expectations
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:, ti])
        motif_seq_preds = model.predict(X)
        print(model.score(X, motif_seq_scores[:, ti]))

        # filter coefficients (first num_filters entries of coef_)
        with open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w") as coef_out:
            for i in range(num_filters):
                coef_out.write("%3d %6.2f\n" % (i, model.coef_[i]))

        # interaction = observed score minus additive prediction
        filter_interaction = np.zeros((num_filters, num_filters))
        with open("%s/table_t%d.txt" % (options.out_dir, ti), "w") as table_out:
            si = 0
            for i in range(num_filters):
                for j in range(num_filters):
                    filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si]
                    cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j])
                    table_out.write("%3d %3d %6.3f %6.3f %6.3f\n" % cols)
                    si += 1

        # plot heat map; close the figure so figures do not pile up per target
        plt.figure()
        sns.heatmap(filter_interaction)
        plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
        plt.close()
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from dataprepHousing import getData
import os, sys

# Fit a Bayesian ridge regression on the housing sample and report how well
# it reproduces the training prices.

# Locate the CSV relative to this script.  os.path.join is used instead of
# hand-written backslashes so the path is valid on every platform (the former
# literal also relied on the invalid escape sequence "\d").
full_path = os.path.realpath(__file__)
file = os.path.join(os.path.dirname(full_path), "data", "housingSample.csv")

(X, Y, records) = getData(file)

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, price_train, price_test = train_test_split(
    X, Y, test_size=0.1, random_state=42)

model = BayesianRidge()
# ravel() flattens the target before fitting.
model.fit(X_train, price_train.ravel())
print(model)

# Summarize the fit of the model
#print(model.intercept_, model.coef_, mse)
print(model.score(X_train, price_train))

# NOTE(review): every metric below is computed on the TRAINING split; the
# held-out X_test / price_test are never evaluated in this script.
predPrices = model.predict(X_train)
mse = mean_squared_error(price_train, predPrices)
rs = r2_score(price_train, predPrices)
print("training mse:", mse)
print("training score:", rs)
def nickmain1():
    """Cross-validate or produce a submission for the five soil targets.

    For each target in Ca, P, pH, SOC and Sand the 200 best spectral columns
    (``m7497.96`` .. ``m599.76``) are chosen with ``SelectKBest`` and combined
    with a fixed list of common features; a ``BayesianRidge`` model is then
    trained on those columns.

    Behaviour depends on the module-level flag ``issub``:

    * ``issub == False`` -- run 10-fold cross-validation and print the mean
      of the per-target RMSE for every fold and overall.
    * otherwise -- train on all rows, predict the test set and write the
      predictions to ``predloc``.

    Reads the module-level names ``trainloc``, ``testloc``, ``predloc``,
    ``issub`` and ``ev``.
    """
    train_all = pd.read_csv(trainloc)
    test_all = pd.read_csv(testloc)
    targets = ['Ca', 'P', 'pH', 'SOC', 'Sand']

    # Encode the categorical depth as a signed number.
    # NOTE(review): "Depth" is not part of common_features below, so this
    # encoding is not used by the models in this function -- confirm intent.
    train_all["Depth"] = train_all["Depth"].replace(["Topsoil", "Subsoil"], [10, -10])
    test_all["Depth"] = test_all["Depth"].replace(["Topsoil", "Subsoil"], [10, -10])

    common_features = ['BSAN', 'BSAS', 'BSAV', 'CTI', 'ELEV', 'EVI', 'LSTD',
                       'LSTN', 'REF1', 'REF2', 'REF3', 'REF7', 'RELI', 'TMAP',
                       'TMFI']

    # Label-based slice of the spectral columns (.loc replaces the removed .ix).
    spectra = train_all.loc[:, 'm7497.96':'m599.76']
    colnames = spectra.columns.values

    # Integer part of each column name; only used for the sanity prints below.
    colnames_nums = []
    for x in colnames:
        match = re.search(r'(?<=m)[0-9]*', x)
        if match:
            colnames_nums.append(int(match.group()))
    print(len(colnames))
    print(len(colnames_nums))
    print(len(train_all.loc[0, 'm7497.96':'m599.76'].values))

    # Per-target feature selection on the spectral columns.
    feats_list = {}
    for target in targets:
        selector = SelectKBest(f_regression, k=200)
        selector.fit(spectra, train_all[target])
        selected = selector.get_support()
        feats = [col for (col, sel) in zip(list(colnames), selected) if sel]
        feats_list[target] = feats + common_features

    # Submission frame; target columns are placeholders until overwritten.
    ids = np.genfromtxt(testloc, dtype=str, skip_header=1, delimiter=',', usecols=0)
    df = pd.DataFrame({"PIDN": ids,
                       "Ca": test_all['PIDN'],
                       "P": test_all['PIDN'],
                       "pH": test_all['PIDN'],
                       "SOC": test_all['PIDN'],
                       "Sand": test_all['PIDN']})

    # indices=False is passed so the folds can be used to index the arrays below.
    cv = cross_validation.KFold(len(train_all), n_folds=10, indices=False)

    subresults = {}
    results = []
    if issub == False:
        for train_sub, test_sub in cv:
            for target in targets:
                features = np.array(train_all[feats_list[target]])
                labels = np.array(train_all[target])
                clf = BayesianRidge(compute_score=True)
                clf.fit(features[train_sub], labels[train_sub])
                pred = clf.predict(features[test_sub])
                subresults[target] = ev.rmse(labels[test_sub], np.array(pred))
            subtotal = 0
            for x in subresults:
                subtotal = subtotal + subresults[x]
            print ("average for the run is ", subtotal/len(targets))
            results.append(subtotal/len(targets))
        print("Results: " + str(np.array(results).mean()))
    else:
        for target in targets:
            clf = BayesianRidge(compute_score=True)
            clf.fit(np.array(train_all[feats_list[target]]), np.array(train_all[target]))
            pred = clf.predict(np.array(test_all[feats_list[target]]))
            df[target] = pred
        # `columns` is the current name of the former `cols` keyword.
        df.to_csv(predloc, index=False, columns=["PIDN", "Ca", "P", "pH", "SOC", "Sand"])
cross_validation.train_test_split(X_bns, Y, test_size=test_size, random_state=0) # k = int(0.5 * n_features) # print("-----------------------------------------------") # print("Perform chi2 feature selection k=", k) # print("-----------------------------------------------") # X_train, X_test = selectFeatures(X_train, X_test, y_train, k) print("-----------------------------------------------") print("SVM Classification of training set") print("-----------------------------------------------") class_weight = {0:5} print("Class weight=", class_weight) clf = BayesianRidge(compute_score=True).fit(X_train, y_train) print("Test svm.SVC score=", clf.score(X_test, y_test)) print("Train svm.SVC score=", clf.score(X_train, y_train)) print("-----------------------------------------------") print("Metrics on TEST SET") print("-----------------------------------------------") y_pred = clf.predict(X_test) print(metrics.classification_report(y_test, y_pred, target_names=label_names)) print(metrics.confusion_matrix(y_test, y_pred)) print("-----------------------------------------------") print("Metrics on TRAIN SET") print("-----------------------------------------------") y_predTrain = clf.predict(X_train)
def bayes_ridge_reg(x_data, y_data):
    """Fit a Bayesian ridge regression and return its in-sample predictions.

    The model is trained on ``x_data`` / ``y_data`` and then used to predict
    the same ``x_data``; the fitted coefficients and intercept are printed.

    Args:
        x_data: feature matrix passed to ``BayesianRidge.fit``.
        y_data: target values passed to ``BayesianRidge.fit``.

    Returns:
        list of int: the predictions for ``x_data``, each truncated toward
        zero by ``int()``.
    """
    br = BayesianRidge()
    br.fit(x_data, y_data)
    print('br params', br.coef_, br.intercept_)
    adjusted_result = br.predict(x_data)
    # Build a real list: on Python 3 map() would return a one-shot iterator.
    return [int(value) for value in adjusted_result]
def prediction_BayesianRidge (X_train, Y_train, X_test, Y_test,normalize):
    """Fit a BayesianRidge model on the training split and summarise it.

    Args:
        X_train: training features; must expose ``.columns`` and ``.shape``.
        Y_train: training targets.
        X_test: testing features.
        Y_test: testing targets.
        normalize: forwarded to ``BayesianRidge(normalize=...)`` and recorded
            in the summary as "Y"/"N".

    Returns:
        tuple: ``(pred_test, coeff_df, pred_train, result)`` where
        ``coeff_df`` has the columns "Fact" and "Coefficient" and ``result``
        is a dict holding the method name, data shapes, intercept, number of
        coefficients, the fact with the largest coefficient (its description
        is looked up in the module-level ``cf_dict``), train/test MSE and the
        model score on the test split.
    """
    regressor = BayesianRidge(normalize=normalize)
    # Only the training split is used for fitting.
    regressor.fit(X_train, Y_train)

    # One row per input column, paired with its fitted coefficient.
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]
    coeff_df["Coefficient"] = pd.Series(regressor.coef_)

    pred_train = regressor.predict(X_train)
    pred_test = regressor.predict(X_test)

    # Row holding the largest coefficient.
    strongest = coeff_df.iloc[coeff_df["Coefficient"].idxmax()]

    result = {
        "method": "BayesianRidge",
        "normalize": "Y" if normalize else "N",
        "X_train_shape": X_train.shape,
        "Y_train_shape": Y_train.shape,
        "X_test_shape": X_test.shape,
        "Y_test_shape": Y_test.shape,
        "intercept": regressor.intercept_,
        "num_coef": len(regressor.coef_),
        "max_fact": cf_dict.loc[strongest["Fact"], "description"],
        "max_fact_value": strongest["Coefficient"],
        "MSE_train": np.mean((Y_train - pred_train) ** 2),
        "MSE_test": np.mean((Y_test - pred_test) ** 2),
        "variance": regressor.score(X_test, Y_test),
    }
    return pred_test, coeff_df, pred_train, result
# Create weights with a precision lambda_ of 4. lambda_ = 4. w = np.zeros(n_features) # Only keep 10 weights of interest relevant_features = np.random.randint(0, n_features, 10) for i in relevant_features: w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_)) # Create noise with a precision alpha of 50. alpha_ = 50. noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples) # Create the target y = np.dot(X, w) + noise ############################################################################### # Fit the Bayesian Ridge Regression and an OLS for comparison clf = BayesianRidge(compute_score=True) clf.fit(X, y) ols = LinearRegression() ols.fit(X, y) ############################################################################### # Plot true weights, estimated weights, histogram of the weights, and # predictions with standard deviations lw = 2 plt.figure(figsize=(6, 5)) plt.title("Weights of the model") plt.plot(clf.coef_, color='lightgreen', linewidth=lw, label="Bayesian Ridge estimate") plt.plot(w, color='gold', linewidth=lw, label="Ground truth") plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
from sklearn.linear_model import BayesianRidge, LinearRegression
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from bokeh.plotting import figure
from bokeh.io import show as bokehShow

# Split features and labels in ONE call so both are shuffled with the same
# permutation.  Two separate train_test_split calls (as before) shuffle
# independently, which pairs every row with the label of an unrelated row.
train, test, spamtrain, spamtest = train_test_split(
    importation.t, importation.valspam, test_size=0.2)

# Fit a Bayesian ridge regression and an ordinary least squares baseline on
# the same training data.
clf = BayesianRidge(compute_score=True)
ols = LinearRegression()
clf.fit(train, spamtrain)
ols.fit(train, spamtrain)

# Ground truth and the two sets of predictions for the held-out rows.
expected = spamtest
predicted = clf.predict(test)
predicted1 = ols.predict(test)
#print(spamtrain)
def main():
    """Train a post-hoc model on learned representations and report on it.

    Command line: ``[options] <repr_hdf5> <data_hdf5> <target_index>``.

    Loads the target column ``target_index`` from ``train_out`` / ``test_out``
    and, depending on ``-a`` / ``-s``, the learned representations and/or the
    additional features.  In regression mode (``-r``) a ``BayesianRidge`` is
    fit and ``r2.txt``, ``scatter.pdf`` and ``residuals.pdf`` are written;
    otherwise a ``LogisticRegression`` is fit and ``auc.txt``, ``roc.txt`` and
    ``prc.txt`` are written.  In both modes the model is pickled to
    ``model.pkl`` and the coefficients of the additional features are written
    to ``add_coefs.txt``.  All outputs go to the ``-o`` directory.
    """
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide representation HDF5, full data HDF5, and target index')
    repr_hdf5_file = args[0]
    data_hdf5_file = args[1]
    # The target is used as a column index below, so it must be an integer;
    # optparse hands positional arguments over as strings.
    try:
        target_i = int(args[2])
    except ValueError:
        parser.error('Target index must be an integer, got %r' % args[2])

    # With both flags set no feature matrix would ever be built.
    if options.add_only and options.seq_only:
        parser.error('Options -a and -s are mutually exclusive')

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################

    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:, target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:, target_i]

    # load training representations
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []
    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:, fi], test_a[:, fi], add_labels[fi]

        # append additional features; add_i is the column offset at which
        # the additional features start in the final matrix
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x, train_a), axis=1)
            test_x = np.concatenate((test_x, test_a), axis=1)

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]

    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy
        with open('%s/r2.txt' % options.out_dir, 'w') as acc_out:
            acc_out.write('%s\n' % model.score(test_x, test_y))

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha': 0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y - test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()

    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy
        test_preds = model.predict_proba(test_x)[:, 1].flatten()
        with open('%s/auc.txt' % options.out_dir, 'w') as acc_out:
            acc_out.write('%s\n' % roc_auc_score(test_y, test_preds))

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)
        with open('%s/roc.txt' % options.out_dir, 'w') as roc_out:
            for i in range(len(fpr)):
                roc_out.write('%f\t%f\t%f\n' % (fpr[i], tpr[i], thresholds[i]))

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)
        with open('%s/prc.txt' % options.out_dir, 'w') as prc_out:
            for i in range(len(precision)):
                prc_out.write('%f\t%f\n' % (precision[i], recall[i]))

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table
    with open('%s/add_coefs.txt' % options.out_dir, 'w') as coef_out:
        for ai in range(len(add_labels)):
            # BayesianRidge.coef_ is indexed directly, LogisticRegression.coef_
            # through its first row.
            if options.regression:
                coefi = model.coef_[add_i + ai]
            else:
                coefi = model.coef_[0, add_i + ai]
            coef_out.write('%s %s\n' % (add_labels[ai], coefi))
def _fill_iterative(
    df: pd.DataFrame,
    seed: int = 1,
    max_iter: int = 10,
    estimator: Any = None,
):
    """
    Gets a single imputation using IterativeImputer from sklearn.

    Numeric columns are imputed with IterativeImputer; non-numeric columns
    are copied through unchanged.

    Clips imputed values to min-max of observed values to avoid brokenly
    large values. When imputation model doesn't converge nicely we otherwise
    end up with extreme values that are out of range of the float32 type used
    by model training, causing crashes. Consider this clipping a workaround
    until a more robust imputation strategy is in place.

    NOTE(review): sample_posterior is not passed to IterativeImputer here, so
    the sklearn default applies -- confirm whether it was meant to be True
    for multiple imputation.

    Args:
        df: Frame to impute; it is not modified.
        seed: Passed to IterativeImputer as ``random_state``.
        max_iter: Passed to IterativeImputer as ``max_iter``.
        estimator: Estimator handed to IterativeImputer. ``None`` (default)
            means a fresh ``BayesianRidge()``.

    Returns:
        A new DataFrame with the same index and columns as ``df`` in which
        the numeric columns have been imputed and clipped.
    """
    # Create the default per call rather than once at import time.
    if estimator is None:
        estimator = BayesianRidge()

    log.info("Started imputing "
             f"df with shape {df.shape} "
             f"missing share {df.isnull().mean().mean()} "
             f"with estimator {estimator.__class__.__name__}")

    # Only impute numeric cols
    cols_numeric = list(df.select_dtypes(include=[np.number]).columns.values)
    cols_not_numeric = [col for col in df.columns if col not in cols_numeric]
    log.info(f"imputing {len(cols_numeric)} numeric cols, "
             f"ignoring {len(cols_not_numeric)} non-numeric cols")

    for col in cols_numeric:
        log.debug(
            f"Missing share before impute {col} : {df[col].isnull().mean()}")

    # Get bounds so we can clip imputed values to not be outside
    # observed values
    observed_min = df[cols_numeric].min()
    observed_max = df[cols_numeric].max()

    # Empty frame with df's index; columns are added in df's order so the
    # result keeps the original column layout.
    df_imputed = df.loc[:, []].copy()
    for col in df:
        df_imputed[col] = np.nan

    df_imputed[cols_numeric] = IterativeImputer(
        random_state=seed, max_iter=max_iter,
        estimator=estimator).fit_transform(df[cols_numeric])
    df_imputed[cols_not_numeric] = df[cols_not_numeric]

    # Clip imputed values to observed min-max range
    df_imputed[cols_numeric] = df_imputed[cols_numeric].clip(observed_min,
                                                             observed_max,
                                                             axis=1)

    # Report the missing share of the IMPUTED frame, not of the input.
    log.info("Finished _fill_iterative() "
             f"Imputed df missing share numeric cols "
             f"{df_imputed[cols_numeric].isnull().mean().mean()}")

    for col in cols_numeric:
        log.debug("Missing share after impute "
                  f"{col} : {df_imputed[col].isnull().mean()}")

    return df_imputed
from sklearn.linear_model import BayesianRidge
import matplotlib.pyplot as plt
import numpy as np

# Fit a Bayesian ridge regression to five (x, y) points and plot the fitted
# line on top of the data.
X = [[6], [8], [10], [14], [18]]
y = [[7], [9], [13], [17.5], [18]]

bayesModel = BayesianRidge(
    n_iter=300,           # maximum number of iterations
    tol=1.e-3,            # convergence tolerance at which fitting stops
    alpha_1=0.5,          # shape parameter of the Gamma prior over alpha
    alpha_2=0.5,          # rate parameter of the Gamma prior over alpha
    lambda_1=0.6,         # shape parameter of the Gamma prior over lambda
    lambda_2=0.6,         # rate parameter of the Gamma prior over lambda
    compute_score=False,  # if True, compute the objective at every iteration
    fit_intercept=True,   # whether to fit an intercept
    normalize=False,      # whether to normalize the regressors
    copy_X=True,          # copy X instead of overwriting it
    verbose=False         # verbose mode
)
# y is written as a column vector, which is convenient for plotting; flatten
# it for fit(), which expects a 1-D target.
bayesModel.fit(X, np.ravel(y))

# Evaluate the fitted model on a dense grid for the line plot.
xx = np.linspace(5, 20, 100)
xx = xx.reshape(xx.shape[0], 1)
yy = bayesModel.predict(xx)

plt.plot(X, y, 'k.')
plt.plot(xx, yy, 'r-')
plt.show()
y_test_predictions = lr.predict(X_test)
# Fraction of the positive class (y_test == 1) that is predicted correctly.
# The whole expression is wrapped in print(...): written as
# `print (a == b).sum()...` it would, on Python 3, print the boolean array
# and then fail on the None returned by print().
print((y_test[y_test==1] == y_test_predictions[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0])
#0.875
#But, at what expense do we do this? To find out, use the following command:
# Overall accuracy across all classes.
print((y_test_predictions == y_test).sum().astype(float) / y_test.shape[0])
#0.967999

# Directly applying Bayesian ridge regression
from sklearn.datasets import make_regression
X, y = make_regression(1000, 10, n_informative=2, noise=20)
#We can just "throw" ridge regression at the problem with a few simple steps:
from sklearn.linear_model import BayesianRidge
br = BayesianRidge()
br.fit(X, y)
print(br.coef_)
#array([0.3000136 , -0.33023408, 68.166673, -0.63228159, 0.07350987,
#-0.90736606, 0.38851709, -0.8085291 , 0.97259451, 68.73538646])

# Same fit with larger shape parameters for both Gamma hyperpriors.
br_alphas = BayesianRidge(alpha_1=10, lambda_1=10)
br_alphas.fit(X, y)
print(br_alphas.coef_)
#array([0.30054387, -0.33130025, 68.10432626, -0.63056712,
#0.07751436, -0.90919326, 0.39020878, -0.80822013,
#0.97497567, 68.67409658])

# Using boosting to learn from errors
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    """Return an estimator for ``model_name`` configured with stock params.

    The stock parameters for the model are looked up, reduced for
    hyperparameter searches, and then overridden by ``training_params``.
    The estimator is instantiated and the merged parameters are applied with
    ``set_params``.

    Args:
        model_name: Key identifying the estimator, e.g. ``'BayesianRidge'``.
            Names starting with ``'DeepLearning'`` trigger a lazy import of
            Keras on first use.
        training_params: Optional dict of parameters that override the stock
            values.
        is_hp_search: When ``True``, DeepLearning models use 50 epochs and
            LGBM models 500 estimators as their stock values.

    Returns:
        The estimator instance returned by ``set_params(**model_params)``.

    Raises:
        KeyError: If ``model_name`` is unknown or belongs to an optional
            library that is not installed.
    """
    global keras_imported

    # For Keras
    epochs = 1000

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {'n_jobs': -2, 'n_estimators': 30},
        'ExtraTreesClassifier': {'n_jobs': -1},
        'AdaBoostClassifier': {},
        'SGDClassifier': {'n_jobs': -1},
        'Perceptron': {'n_jobs': -1},
        'LinearSVC': {'dual': False},
        'LinearRegression': {'n_jobs': -2},
        'RandomForestRegressor': {'n_jobs': -2, 'n_estimators': 30},
        'LinearSVR': {'dual': False, 'loss': 'squared_epsilon_insensitive'},
        'ExtraTreesRegressor': {'n_jobs': -1},
        'MiniBatchKMeans': {'n_clusters': 8},
        'GradientBoostingRegressor': {'learning_rate': 0.1, 'warm_start': True},
        'GradientBoostingClassifier': {'learning_rate': 0.1, 'warm_start': True},
        'SGDRegressor': {'shuffle': False},
        'PassiveAggressiveRegressor': {'shuffle': False},
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'DeepLearningClassifier': {'epochs': epochs, 'batch_size': 50, 'verbose': 2},
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # Models without stock params start from an empty dict.
    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    # Hyperparameter searches train many models, so use cheaper settings.
    if is_hp_search == True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    # Constructors that reject max_iter/tol raise TypeError; fall back to
    # their defaults in that case.
    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier()
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    # Optional libraries are only registered when they are installed.
    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if keras_imported == False:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except Exception:
                # Best effort only; Exception (not a bare except) so that
                # KeyboardInterrupt / SystemExit still propagate.
                pass

            # Keras is imported lazily and published as module globals so
            # the model-building helpers can use it.
            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        # Bare raise re-raises the same KeyError with its traceback intact.
        raise

    # In the test suite, force n_jobs to 1.
    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1

    model_with_params = model_without_params.set_params(**model_params)
    return model_with_params
def main():
    """Predict tweet scores from averaged word-embedding features.

    Reads tab-separated ``index<TAB>score<TAB>tweet`` lines from ``--train``,
    turns every tweet into a feature vector with ``embedfeats`` and fits a
    ``BayesianRidge`` regressor.

    * With ``--cv`` set: 10-fold cross-validation on the training data.  The
      per-fold cosine similarity and its mean / standard deviation go to
      stderr; the out-of-fold prediction of every training item goes to
      stdout as ``index<TAB>prediction``.
    * Otherwise: fit on all training data and print ``index<TAB>prediction``
      for every line of ``--test``.

    NOTE(review): ``--cv`` is declared without a type or action, so ANY value
    supplied on the command line (even "False") is a non-empty string and
    enables cross-validation.
    """
    parser = argparse.ArgumentParser(description="""Creates embeddings predictions.""")
    parser.add_argument('--train')
    parser.add_argument('--test')
    parser.add_argument('--embeddings')
    parser.add_argument('--cv',default=False)

    args = parser.parse_args()

    stoplist = stopwords.words("english")
    stoplist.extend("it's 've 's i'm he's she's you're we're they're i'll you'll he'll ".split(" "))

    # word -> embedding vector; every line is "<word> <v1> <v2> ...".
    embeddings = {}
    with codecs.open(args.embeddings, encoding="utf-8") as embeddings_file:
        for line in embeddings_file:
            line = line.strip()
            if line:
                a = line.split(" ")
                # cast to float, otherwise we cannot operate
                embeddings[a[0]] = np.array([float(v) for v in a[1:]])

    train_indices = []
    test_indices = []
    train_scores = []
    train_features = []
    test_features = []

    # The learner is fixed; the classification branches below are kept for
    # the learner types that could previously be selected.
    learner = BayesianRidge()
    learner_type = "regression"

    le = preprocessing.LabelEncoder()

    with open(args.train) as train_file:
        for line in train_file:
            (index, score, tweet) = line.strip().split("\t")
            train_indices.append(index)
            train_scores.append(float(score))
            tweet = tweet.split(" ")
            train_features.append(embedfeats(tweet, embeddings, stoplist))

    train_indices = np.array(train_indices)
    train_scores = np.array(train_scores)
    train_features = np.array(train_features)

    # Integer-encoded scores, only used by classification learners.
    train_scores_int = [roundup(v) for v in train_scores]
    le.fit(train_scores_int)
    train_scores_int_transformed = le.transform(train_scores_int)

    if args.cv:
        train_cv = {}
        cross = cross_validation.KFold(len(train_scores), n_folds=10)
        acc = []
        X = train_features
        y = train_scores
        for train_index, test_index in cross:
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            learner.fit(X_train, y_train)
            y_pred = learner.predict(X_test)
            assert(len(y_pred) == len(test_index))

            # Remember the out-of-fold prediction of every held-out item.
            tids = train_indices[test_index]
            for twid, pred in zip(tids, y_pred):
                train_cv[twid] = pred

            acc.append(cosine_similarity(y_test, y_pred)[0][0])

        sys.stderr.write("Cosine of 10-folds: {}\n".format(acc))
        sys.stderr.write("Macro average: {} {}\n".format(
            np.mean(np.array(acc)), np.std(np.array(acc))))

        for twid in train_indices:
            print("{}\t{}".format(twid, train_cv[twid]))
    else:
        with open(args.test) as test_file:
            for line in test_file:
                (index, score, tweet) = line.strip().split("\t")
                test_indices.append(index)
                tweet = tweet.split(" ")
                test_features.append(embedfeats(tweet, embeddings, stoplist))

        # when features are generated, train and test
        if learner_type == "regression":
            learner.fit(train_features, train_scores)
        else:
            learner.fit(train_features, train_scores_int_transformed)

        predicted_scores = learner.predict(test_features)
        if learner_type != "regression":
            predicted_scores = le.inverse_transform(predicted_scores)

        for index, score in zip(test_indices, predicted_scores):
            print(index + "\t" + str(score))
# Create weigts with a precision lambda_ of 4. lambda_ = 4. w = np.zeros(n_features) # Only keep 10 weights of interest relevant_features = np.random.randint(0, n_features, 10) for i in relevant_features: w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_)) # Create noise with a precision alpha of 50. alpha_ = 50. noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples) # Create the target y = np.dot(X, w) + noise ############################################################################### # Fit the Bayesian Ridge Regression and an OLS for comparison clf = BayesianRidge(compute_score=True) clf.fit(X, y) ols = LinearRegression() ols.fit(X, y) ############################################################################### # Plot true weights, estimated weights and histogram of the weights pl.figure(figsize=(6, 5)) pl.title("Weights of the model") pl.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate") pl.plot(w, 'g-', label="Ground truth") pl.plot(ols.coef_, 'r--', label="OLS estimate") pl.xlabel("Features") pl.ylabel("Values of the weights") pl.legend(loc="best", prop=dict(size=12))
def get_models():
    """Build the regression models to compare, keyed by a short name.

    Every entry except ``'nnet'`` is a ``GridSearchCV`` wrapper scored with
    ``neg_mean_absolute_error``; ``'nnet'`` is a plain ``MLPRegressor`` with
    fixed hyperparameters.  No model is fitted here.

    Returns:
        dict: maps ``'nnet'``, ``'lr'``, ``'dtr'``, ``'rf'``, ``'svm'``,
        ``'bayes'``, ``'knn'`` and ``'gp'`` to an unfitted estimator.
    """
    models = dict()

    # Neural Networks
    models['nnet'] = MLPRegressor(activation='relu',
                                  hidden_layer_sizes=(50, 50, 50),
                                  learning_rate='adaptive',
                                  learning_rate_init=0.1,
                                  max_iter=2000,
                                  solver='sgd',
                                  alpha=0.01,
                                  random_state=0,
                                  verbose=True)

    # Linear Regression
    # Real booleans are required here: the strings 'True' and 'False' are
    # both truthy, so a grid over them would never try normalize=False.
    # NOTE(review): the `normalize`, `n_iter` and some `criterion` values in
    # this function presumably depend on the installed scikit-learn version
    # -- verify against the pinned release.
    tuned_parameters_lr = [{'normalize': [True, False]}]
    clf_lr = GridSearchCV(LinearRegression(),
                          tuned_parameters_lr,
                          scoring='neg_mean_absolute_error')
    models['lr'] = clf_lr

    # Decision Tree
    tuned_parameters_dtr = [{
        'min_samples_leaf': [5, 10, 50, 100],
        'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
        'splitter': ['best', 'random'],
        'random_state': [0]
    }]
    clf_dtr = GridSearchCV(DecisionTreeRegressor(),
                           tuned_parameters_dtr,
                           scoring='neg_mean_absolute_error')
    models['dtr'] = clf_dtr

    # Random Forest
    tuned_parameters_rf = [{
        'min_samples_leaf': [5, 10, 50, 100],
        'n_estimators': [5, 10, 50, 100],
        'criterion': ['mse', 'mae'],
        'random_state': [0]
    }]
    clf_rf = GridSearchCV(RandomForestRegressor(),
                          tuned_parameters_rf,
                          scoring='neg_mean_absolute_error')
    models['rf'] = clf_rf

    # SVR
    tuned_parameters_svm = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }]
    clf_svm = GridSearchCV(SVR(),
                           tuned_parameters_svm,
                           scoring='neg_mean_absolute_error')
    models['svm'] = clf_svm

    # Bayesian Ridge
    tuned_parameters_bayes = [{'n_iter': [100, 200, 300, 400, 500]}]
    clf_bayes = GridSearchCV(BayesianRidge(),
                             tuned_parameters_bayes,
                             scoring='neg_mean_absolute_error')
    models['bayes'] = clf_bayes

    # kNNeighbours
    tuned_parameters_knn = [{
        'n_neighbors': [1, 5, 10, 15, 20, 50],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }]
    clf_knn = GridSearchCV(KNeighborsRegressor(),
                           tuned_parameters_knn,
                           scoring='neg_mean_absolute_error')
    models['knn'] = clf_knn

    # Gaussian Process
    tuned_parameters_gp = [{
        'kernel': [WhiteKernel() + RBF() + DotProduct(),
                   RBF() + DotProduct()],
        'random_state': [0]
    }]
    clf_gp = GridSearchCV(GaussianProcessRegressor(),
                          tuned_parameters_gp,
                          scoring='neg_mean_absolute_error')
    models['gp'] = clf_gp

    return models