def _bayesianridge(*, train, test, x_predict=None, metrics, n_iter=300, tol=0.001,
                   alpha_1=1e-06, alpha_2=1e-06, lambda_1=1e-06, lambda_2=1e-06,
                   alpha_init=None, lambda_init=None, compute_score=False,
                   fit_intercept=True, normalize=False, copy_X=True, verbose=False):
    """Fit a BayesianRidge regressor and evaluate it on the test split.

    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.BayesianRidge.html#sklearn.linear_model.BayesianRidge

    Parameters
    ----------
    train, test : tuple
        (X, y) pairs used for fitting and evaluation.
    x_predict : array-like, optional
        If given, predictions for it are returned as the third element.
    metrics : str
        One of 'mse', 'rmse' or 'mae'.
    Remaining keyword arguments are forwarded to BayesianRidge unchanged.

    Returns
    -------
    tuple
        (model_name, accuracy, y_predict or None).

    Raises
    ------
    ValueError
        For an unsupported `metrics` name. (The original if-chain left
        `accuracy` unbound in that case, causing a NameError at return.)
    """
    model = BayesianRidge(n_iter=n_iter, tol=tol, alpha_1=alpha_1, alpha_2=alpha_2,
                          lambda_1=lambda_1, lambda_2=lambda_2, alpha_init=alpha_init,
                          lambda_init=lambda_init, compute_score=compute_score,
                          fit_intercept=fit_intercept, normalize=normalize,
                          copy_X=copy_X, verbose=verbose)
    model.fit(train[0], train[1])
    model_name = 'Bayesian Ridge'
    y_hat = model.predict(test[0])
    # Dispatch table replaces the original if-chain and fails fast on
    # unknown metric names instead of raising NameError later.
    metric_funcs = {'mse': _mse, 'rmse': _rmse, 'mae': _mae}
    if metrics not in metric_funcs:
        raise ValueError(
            "metrics must be one of {}, got {!r}".format(sorted(metric_funcs), metrics))
    accuracy = metric_funcs[metrics](test[1], y_hat)
    if x_predict is None:
        return (model_name, accuracy, None)
    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def Basyen_stacking(oof_lgb, oof_xgb, predictions_lgb, predictions_xgb, sub, target):
    """Blend LightGBM and XGBoost predictions via BayesianRidge stacking.

    Fits the meta-model over a 5x2 repeated K-fold, writes the averaged test
    prediction to ./Basyen_stacking.csv and returns the out-of-fold MSE.
    """
    res = sub.copy()
    # Second-level design matrices: one column per first-level model.
    meta_train = np.vstack([oof_lgb, oof_xgb]).transpose()
    meta_test = np.vstack([predictions_lgb, predictions_xgb]).transpose()
    splitter = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(meta_train.shape[0])
    predictions = np.zeros(meta_test.shape[0])
    n_models = 10  # 5 splits x 2 repeats
    for fold_, (fit_idx, hold_idx) in enumerate(splitter.split(meta_train, target)):
        print("fold {}".format(fold_))
        ridge = BayesianRidge()
        ridge.fit(meta_train[fit_idx], target[fit_idx])
        oof_stack[hold_idx] = ridge.predict(meta_train[hold_idx])
        predictions += ridge.predict(meta_test) / n_models
    cross_validation_loss = mean_squared_error(target, oof_stack)
    print(cross_validation_loss)
    res[1] = predictions
    mean = res[1].mean()
    print('mean:', mean)
    res.to_csv("./Basyen_stacking.csv", index=False, header=None)
    return cross_validation_loss
def fillna_knn_reg(df, base, target, fraction=1, threshold=10, n_neighbors=5):
    """Impute missing values of `df[target]` in place with a BayesianRidge
    model trained on the `base` columns of the non-missing rows.

    Parameters
    ----------
    df : DataFrame
        Modified in place: rows where `target` is null receive predictions.
    base : list or np.ndarray
        Names of the predictor columns.
    target : str
        Column whose NaNs are imputed.
    fraction, threshold, n_neighbors :
        Legacy parameters kept for interface compatibility; they are only
        echoed to stdout and do not affect the model.
    """
    # BUG FIX: in the original, `and` bound tighter than `or`, so the
    # `isinstance(target, str)` check was skipped whenever base was a list.
    assert isinstance(base, (list, np.ndarray)) and isinstance(target, str)
    whole = [target] + base
    print(threshold, "\n", fraction, "\n", n_neighbors)
    miss = df[target].isnull()
    notmiss = ~miss
    X_target = df.loc[notmiss, whole]
    Y = X_target[target]
    X = X_target[base]
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                        test_size=0.2,
                                                        random_state=5)
    print('fitting')
    clf = BayesianRidge()
    clf.fit(X, Y)
    print('predicting')
    print("Fit a model X_test and claculate Mean Squared Error with Y_test:")
    # NOTE(review): the model is fit on ALL labelled rows, so X_test is part
    # of the training data and this reported MSE is optimistic — confirm
    # whether fitting on X_train only was intended.
    print(np.mean((Y_test - clf.predict(X_test))**2))
    Z = clf.predict(df.loc[miss, base])
    print('writing result to df')
    df.loc[miss, target] = Z
def train_BayesianRidge(self, data):
    """Train a BayesianRidge model on (train, validation) data.

    Reports the train R2 and the validation MAE (targets are log1p-scaled
    upstream, hence the expm1 inversion), pickles the fitted model and
    returns predictions for ``self.x_test``.
    """
    train, validacion = data
    x_tr, y_tr = train
    x_val, y_val = validacion
    print('Start training BayesianRidge...')
    start_time = self.timer()
    bayesr = BayesianRidge(normalize=True, n_iter=1000)
    bayesr.fit(x_tr, y_tr)
    print("The R2 is: {}".format(bayesr.score(x_tr, y_tr)))
    self.timer(start_time)
    print("Making prediction on validation data")
    # Undo the log1p transform applied to the targets upstream.
    y_val = np.expm1(y_val)
    y_val_pred = np.expm1(bayesr.predict(x_val))
    mae = mean_absolute_error(y_val, y_val_pred)
    print("El mean absolute error de es {}".format(mae))
    print('Saving model into a pickle')
    # BUG FIX: the original wrapped os.mkdir in a bare `except: pass`, which
    # swallowed every error (permissions, bad path, KeyboardInterrupt).
    # exist_ok=True only ignores the "directory already exists" case.
    os.makedirs('pickles', exist_ok=True)
    with open('pickles/bayesrCV.pkl', 'wb') as f:
        pickle.dump(bayesr, f)
    print('Making prediction and saving into a csv')
    y_test = bayesr.predict(self.x_test)
    return y_test
def modelResultMerge(predictions_lgb, predictions_xgb, train_lgb, train_xgb, target):
    """Stack two base models' predictions with a BayesianRidge meta-learner.

    `train_*` hold out-of-fold predictions, `predictions_*` hold test-set
    predictions; returns the averaged stacked test prediction and prints
    the out-of-fold MSE.
    """
    meta_train = np.vstack([train_lgb, train_xgb]).transpose()
    meta_test = np.vstack([predictions_lgb, predictions_xgb]).transpose()
    cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2012)
    oof_stack = np.zeros(meta_train.shape[0])
    predictions = np.zeros(meta_test.shape[0])
    for fold_, (fit_idx, hold_idx) in enumerate(cv.split(meta_train, target)):
        print("fold {}".format(fold_))
        ridge = BayesianRidge()
        ridge.fit(meta_train[fit_idx], target.iloc[fit_idx].values)
        oof_stack[hold_idx] = ridge.predict(meta_train[hold_idx])
        # 10 fold-models in total (5 splits x 2 repeats), so this averages.
        predictions += ridge.predict(meta_test) / 10
    print(mean_squared_error(target.values, oof_stack))
    return predictions
def stacking_predict(oof_lgb, oof_xgb, predictions_lgb, predictions_xgb, y_train, verbose_eval=1):
    """BayesianRidge stacking over lgb/xgb out-of-fold and test predictions.

    Returns (oof predictions, averaged test predictions, oof MSE, the list
    of fitted fold models).
    """
    # Meta-features: one column per base model.
    meta_train = np.vstack([oof_lgb, oof_xgb]).transpose()
    meta_test = np.vstack([predictions_lgb, predictions_xgb]).transpose()
    cv = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(meta_train.shape[0])
    predictions = np.zeros(meta_test.shape[0])
    stack_models = []
    for fold_, (fit_idx, hold_idx) in enumerate(cv.split(meta_train, y_train)):
        if verbose_eval:
            print("fold {}".format(fold_))
        ridge = BayesianRidge()
        ridge.fit(meta_train[fit_idx], y_train[fit_idx])
        stack_models.append(ridge)
        oof_stack[hold_idx] = ridge.predict(meta_train[hold_idx])
        predictions += ridge.predict(meta_test) / 10  # 5 splits x 2 repeats
    final_score = mean_squared_error(y_train, oof_stack)
    if verbose_eval:
        print(final_score)
    return oof_stack, predictions, final_score, stack_models
class MixedRegressor(skl.base.BaseEstimator, skl.base.TransformerMixin):
    """Ensemble of three BayesianRidge models: one global model plus two
    specialists trained on the low (y < 40) and high (y > 60) ends of the
    target range. At predict time, extreme global predictions (< 18 or > 88)
    are replaced by the corresponding specialist's output.
    """

    def __init__(self, save_path=None):
        super(MixedRegressor, self).__init__()
        # Optional path used by external persistence code; unused here.
        self.save_path = save_path
        self.regressor = None       # global model, fit on all samples
        self.regressorlt40 = None   # specialist fit on samples with y < 40
        self.regressorgt60 = None   # specialist fit on samples with y > 60

    def fit(self, X, y, sample_weight=None):
        """Fit the global model and both range specialists; returns self."""
        X, y = check_X_y(X, y)
        self.regressor = BayesianRidge()
        self.regressorlt40 = BayesianRidge()
        self.regressorgt60 = BayesianRidge()
        self.regressor.fit(X, y)
        lt40 = y < 40
        gt60 = y > 60
        Xlt40 = X[lt40]
        Ylt40 = y[lt40]
        Xgt60 = X[gt60]
        Ygt60 = y[gt60]
        self.regressorlt40.fit(Xlt40, Ylt40)
        self.regressorgt60.fit(Xgt60, Ygt60)
        return self

    def predict(self, X):
        """Predict with the global model, then overwrite predictions below 18
        (above 88) with the low-end (high-end) specialist's predictions."""
        check_is_fitted(self, ["regressor", "regressorlt40", "regressorgt60"])
        X = check_array(X)
        predictions = self.regressor.predict(X)
        # NOTE(review): masks come from the *global* predictions while the
        # specialists were trained on y < 40 / y > 60; the 18/88 cut-offs look
        # domain-specific — confirm against the expected target range.
        lt18 = predictions < 18
        gt88 = predictions > 88
        if (len(predictions[lt18]) > 0):
            predlt18 = self.regressorlt40.predict(X[lt18])
            predictions[lt18] = predlt18
        if (len(predictions[gt88]) > 0):
            predgt88 = self.regressorgt60.predict(X[gt88])
            predictions[gt88] = predgt88
        return predictions

    def score(self, X, y, sample_weight=None):
        """Return the negative mean squared error (higher is better)."""
        scores = -(self.predict(X) - y)**2 / len(y)
        score = np.sum(scores)
        return score

    def set_save_path(self, save_path):
        # Setter retained for compatibility with external callers.
        self.save_path = save_path
def make_forward_model(data_ss, RDKit_FPs):
    """Build BayesianRidge forward models for two molecular properties.

    Runs a 5-fold cross-validation over `data_ss` (fingerprinting the SMILES
    column with `RDKit_FPs`), then re-fits one model per property on the full
    subset and wraps them for iQSPR.

    Returns
    -------
    tuple
        (BayesianRidgeEstimator wrapping the fitted models, dict of models).

    NOTE(review): all cross-validation results (y_trues/y_preds/... lists)
    are computed but never returned — presumably leftover diagnostics;
    confirm before relying on them. The property key 'H**O-LUMO gap' looks
    like a garbled 'HOMO-LUMO gap' — it must match the data table's column
    name, so verify against `data_ss.columns` before changing it.
    """
    # forward model library from scikit-learn
    from sklearn.linear_model import BayesianRidge
    # xenonpy library for data splitting (cross-validation)
    from xenonpy.datatools import Splitter

    # property name will be used as a reference for calling models
    prop = ['E', 'H**O-LUMO gap']

    # prepare indices for cross-validation data sets (test_size=0: CV only)
    sp = Splitter(data_ss.shape[0], test_size=0, cv=5)

    # initialize output variables: one list of per-fold arrays per property
    y_trues, y_preds = [[] for i in range(len(prop))], [[] for i in range(len(prop))]
    y_trues_fit, y_preds_fit = [[] for i in range(len(prop))], [[] for i in range(len(prop))]
    y_preds_std, y_preds_std_fit = [[] for i in range(len(prop))], [[] for i in range(len(prop))]

    # cross-validation test
    for iTr, iTe in sp.cv():
        x_train = data_ss['SMILES'].iloc[iTr]
        x_test = data_ss['SMILES'].iloc[iTe]
        fps_train = RDKit_FPs.transform(x_train)
        fps_test = RDKit_FPs.transform(x_test)
        y_train = data_ss[prop].iloc[iTr]
        y_test = data_ss[prop].iloc[iTe]
        # One independent model per property column.
        for i in range(len(prop)):
            mdl = BayesianRidge(compute_score=True)
            mdl.fit(fps_train, y_train.iloc[:, i])
            # return_std=True also yields the posterior predictive std.
            prd_train, std_train = mdl.predict(fps_train, return_std=True)
            prd_test, std_test = mdl.predict(fps_test, return_std=True)
            y_trues[i].append(y_test.iloc[:, i].values)
            y_trues_fit[i].append(y_train.iloc[:, i].values)
            y_preds[i].append(prd_test)
            y_preds_fit[i].append(prd_train)
            y_preds_std[i].append(std_test)
            y_preds_std_fit[i].append(std_train)

    # write down list of property name(s) for forward models
    prop = ['E', 'H**O-LUMO gap']  # match with data table for convenience

    # calculate descriptor values for all SMILES in the data subset
    fps_train = RDKit_FPs.transform(data_ss['SMILES'])

    # initialize a dictionary for model storage
    mdls = {}

    # fill in and train the final models on the full subset
    for x in prop:
        mdls[x] = BayesianRidge()
        mdls[x].fit(fps_train, data_ss[x])

    # import descriptor calculator and forward model to iQSPR
    prd_mdls = BayesianRidgeEstimator(descriptor=RDKit_FPs, **mdls)
    return prd_mdls, mdls
def do_cv_pred(train, test, files, use_cols=10, verbose=False):
    """Stratified-KFold BayesianRidge ensemble over pre-generated prediction
    columns; writes the averaged submission and clipped OOF CSVs and prints
    the overall RMSE.

    Parameters
    ----------
    train, test : DataFrame
        Must contain 'card_id', and 'target' in `train`, plus the ensemble
        prediction columns named in `files`.
    files : iterable
        Pairs whose second element is a column name; every 20th group keeps
        the first `use_cols`+1 entries.
    use_cols : int
        Column-selection cut-off within each group of 20.
    verbose : bool
        When True, prints per-fold coefficients and RMSE.
    """
    print("------- do preds --------")
    # Keep one column out of each run of 20, up to use_cols per run.
    ensemble_col = [
        f[1] for i, f in enumerate(files) if (i % 20) <= use_cols
    ]
    if use_cols == 2:
        print(ensemble_col)
    train_x = train[ensemble_col]
    test_x = test[ensemble_col]
    train_y = train["target"]
    submission = pd.DataFrame()
    submission["card_id"] = test["card_id"]
    submission["target"] = 0
    # Stratify folds on the outlier indicator (target < -30).
    outliers = (train["target"] < -30).astype(int).values
    split_num = 5
    skf = model_selection.StratifiedKFold(n_splits=split_num,
                                          shuffle=True,
                                          random_state=4590)
    train_preds = []
    for idx, (train_index, test_index) in enumerate(skf.split(train, outliers)):
        X_train, X_test = train_x.iloc[train_index], train_x.iloc[
            test_index]
        y_train, y_test = train_y.iloc[train_index], train_y.iloc[
            test_index]
        reg = BayesianRidge().fit(X_train, y_train)
        valid_set_pred = reg.predict(X_test)
        score = evaluator.rmse(y_test, valid_set_pred)
        if verbose:
            print(reg.coef_)
            print(score)
        # Accumulate test predictions; averaged over folds below.
        y_pred = reg.predict(test_x)
        submission["target"] = submission["target"] + y_pred
        train_id = train.iloc[test_index]
        train_cv_prediction = pd.DataFrame()
        train_cv_prediction["card_id"] = train_id["card_id"]
        train_cv_prediction["cv_pred"] = valid_set_pred
        train_preds.append(train_cv_prediction)
    train_output = pd.concat(train_preds, axis=0)
    submission["target"] = submission["target"] / split_num
    submission.to_csv(path_const.OUTPUT_SUB, index=False)
    # NOTE(review): clip bounds look dataset-specific (-33.219281 is
    # presumably the known minimum target of this competition) — confirm.
    train_output["cv_pred"] = np.clip(train_output["cv_pred"], -33.219281,
                                      18.0)
    train_output.to_csv(path_const.OUTPUT_OOF, index=False)
    df_pred = pd.merge(train[["card_id", "target"]], train_output,
                       on="card_id")
    rmse_score = evaluator.rmse(df_pred["target"], df_pred["cv_pred"])
    print(rmse_score)
def time_period_model_predict(train, test):
    """Run the time-point and time-period sub-models, then stack their
    predictions with a BayesianRidge meta-model.

    Both sub-models share the same preprocessed data; the stacked score
    (MSE / 2) is printed and the stacked test predictions are returned.
    """
    train_sr, test_sr = data_preprocessing(train, test)
    train_time = train_sr.copy()
    test_time = test_sr.copy()
    X_train, X_test, y_train = time_model.feature_engineering(
        train_time, test_time)
    predictions_time, oof_stack_time = time_model.model_predict(
        X_train, X_test, y_train)
    train_period = train_sr.copy()
    test_period = test_sr.copy()
    X_train, X_test, y_train = period_model.feature_engineering(
        train_period, test_period)
    predictions_period, oof_stack_period = period_model.model_predict(
        X_train, X_test, y_train)
    # '收率' is the target column ("yield rate") — key must stay as-is.
    y_train = train_sr['收率'].values
    # Stack the time-point (time) and time-period (period) results.
    train_stack = np.vstack(
        [np.round(oof_stack_time, 3),
         np.round(oof_stack_period, 3)]).transpose()
    test_stack = np.vstack(
        [np.round(predictions_time, 3),
         np.round(predictions_period, 3)]).transpose()
    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack,
                                                                 y_train)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
        val_data, val_y = train_stack[val_idx], y_train[val_idx]
        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)
        oof_stack[val_idx] = clf_3.predict(val_data)
        # Average over the 10 fold models (5 splits x 2 repeats).
        predictions += clf_3.predict(test_stack) / 10
    print("time_period_stacking score: {:<8.8f}".format(
        mean_squared_error(y_train, oof_stack) / 2))
    return predictions
def get_stacking(self, oof_list, prediction_list, labels):
    '''
    Second-level stacking with a configurable meta-model
    (self.stack_model: 'Ridge' -> BayesianRidge, 'Huber' -> HuberRegressor).

    :param oof_list: out-of-fold predictions
    :param prediction_list: test predictions
    :param labels: true labels of the training data set
    :return: stacking oof predictions of the training set and the testing set
    '''
    # Stack base-model predictions column-wise into meta-features.
    train_stack = np.vstack(oof_list).transpose()
    test_stack = np.vstack(prediction_list).transpose()
    # Repeat the CV once per first-level model (an arbitrary but simple
    # choice — any repeat count would work).
    repeats = len(oof_list)
    kfolder = RepeatedKFold(n_splits=self.n_fold,
                            n_repeats=repeats,
                            random_state=4590)
    # The stacking meta-models are cross-validated as well.
    kfold = kfolder.split(train_stack, labels)
    preds_list = list()  # predictions of the testing data's labels
    stacking_oof = np.zeros(
        train_stack.shape[0]
    )  # predictions of the oof training data's labels
    for train_index, vali_index in kfold:
        k_x_train = train_stack[train_index]
        k_y_train = labels.loc[train_index]
        k_x_vali = train_stack[vali_index]
        assert self.stack_model in ['Ridge', 'Huber']
        if self.stack_model == 'Ridge':
            stacking_model = BayesianRidge()  # BayesianRidge
        if self.stack_model == 'Huber':
            stacking_model = HuberRegressor()
        stacking_model.fit(k_x_train, k_y_train)
        k_pred = stacking_model.predict(k_x_vali)
        stacking_oof[vali_index] = k_pred
        preds = stacking_model.predict(test_stack)
        preds_list.append(preds)
    fold_mae_error = mean_absolute_error(labels, stacking_oof)
    print(f'stacking fold mae training error is {fold_mae_error}')
    # Competition-style score: monotone decreasing in MAE.
    fold_score = 1 / (1 + fold_mae_error)
    print(f'fold score is {fold_score}')
    preds_columns = [
        'preds_{id}'.format(id=i) for i in range(self.n_fold * repeats)
    ]
    # Average the per-fold test predictions column-wise.
    preds_df = pd.DataFrame(data=preds_list)
    preds_df = preds_df.T
    preds_df.columns = preds_columns
    stacking_prediction = list(preds_df.mean(axis=1))
    return stacking_oof, stacking_prediction
def runBayesianRidgeRegressor(self):
    """Fit a BayesianRidge model on the stored train split, score it on the
    test split and render the prediction/residual plots."""
    model = BayesianRidge(n_iter=300,
                          compute_score=True,
                          fit_intercept=True,
                          normalize=True)
    print("Ridge Regression")
    model.fit(self.m_X_train, self.m_y_train)
    y_test_pred = model.predict(self.m_X_test)
    r2 = model.score(self.m_X_test, self.m_y_test)
    y_train_pred = model.predict(self.m_X_train)
    self.displayPredictPlot(y_test_pred)
    self.displayResidualPlot(y_test_pred, y_train_pred)
    self.dispalyModelResult(model, y_test_pred, r2)
def update_model(nr_exp, budget):
    """Server-side active-learning loop (uncertainty sampling).

    Every `budget` seconds, drains `server_buffer`, labels the sample the
    current BayesianRidge model is most uncertain about, refits, logs the
    test RMSE into `errorHistoryUS`, and pushes the pickled model to the
    edge over a socket. Runs until the edge closes the connection.

    NOTE(review): all state lives in module-level globals — presumably
    populated by the data-loading/receiver code elsewhere in this module.
    """
    global x_unlabeled, y_unlabeled, x_labeled, y_labeled, x_test, y_test, clf, server_buffer, errorHistoryUS
    if clf is None:
        # Cold start: fit on the initially labelled pool and record the
        # serialized model size for the experiment outputs.
        clf = BayesianRidge()
        clf.fit(x_labeled, y_labeled)
        np.save('../outputs/model_size_' + str(x_unlabeled.shape[1]) + '.npy',
                np.int32(asizeof.asizeof(pickle.dumps(clf))))
        print(np.int32(asizeof.asizeof(pickle.dumps(clf))))
    host = 'localhost'
    port = 33333
    my_socket = -1
    # Retry until the edge endpoint accepts the connection.
    while my_socket == -1:
        my_socket = utilities.create_socket('normal', host, port)
    print("Connection to edge for model sharing established")
    my_socket.send(pickle.dumps(clf))
    myfile = "../outputs/" + str(nr_exp) + "_budget_" + str(budget) + ".txt"
    with open(myfile, "w") as f:
        f.write("len_buffer_server\n")
    while True:
        time.sleep(budget)
        len_buffer = len(server_buffer)
        if len_buffer > 0:
            # Snapshot and drain exactly the items observed above, so items
            # appended concurrently are kept for the next round.
            buffer_data = server_buffer[:len_buffer]
            del server_buffer[:len_buffer]
            x = [bd[1] for bd in buffer_data]
            idx = [bd[0] for bd in buffer_data]
            x = np.array(x)
            x = x.reshape((x.shape[0], x.shape[2]))
            # Uncertainty sampling: pick the sample with the largest
            # posterior predictive std.
            _, std = clf.predict(x, return_std=True)
            most_uncertain_idx = np.argmax(std)
            x_labeled = np.append(x_labeled,
                                  x[most_uncertain_idx:most_uncertain_idx + 1],
                                  axis=0)
            y_labeled = np.append(y_labeled,
                                  y_unlabeled[idx[most_uncertain_idx]])
            # Refit from scratch on the grown labelled pool.
            clf = BayesianRidge()
            clf.fit(x_labeled, y_labeled)
            p = clf.predict(x_test)
            errorHistoryUS.append(
                np.sqrt(mean_squared_error(y_test.flatten(), p.flatten())))
            with open(myfile, "a") as f:
                f.write(str(len_buffer) + '\n')
            try:
                my_socket.send(pickle.dumps(clf))
            except ConnectionResetError:
                # Edge went away: stop the loop.
                break
def stacking(train_df, test_df, save=True, verbose=True): folds = KFold(n_splits=11, shuffle=True, random_state=326) # Create arrays and dataframes to store results oof_preds = np.zeros(train_df.shape[0]) sub_preds = np.zeros(test_df.shape[0]) feature_importance_df = pd.DataFrame() feats = [f for f in train_df.columns if f not in ['target', 'card_id']] # k-fold for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_df[feats], train_df['target'])): train_x, train_y = train_df[feats].iloc[train_idx], train_df['target'].iloc[train_idx] valid_x, valid_y = train_df[feats].iloc[valid_idx], train_df['target'].iloc[valid_idx] # clf = LinearRegression(n_jobs=-1) clf = BayesianRidge() # clf = Ridge() # clf = Lasso() # clf = ElasticNet() # clf = SGDRegressor() # clf = HuberRegressor() clf.fit(train_x.values, train_y.values) oof_preds[valid_idx] = clf.predict(valid_x.values) sub_preds += clf.predict(test_df[feats].values) / folds.n_splits if verbose: print('Fold %2d RMSE : %.6f' % (n_fold + 1, rmse(valid_y, oof_preds[valid_idx]))) score = rmse(train_df['target'], oof_preds) if verbose: print(f'ALL RMSE: {score}') if save: out_dir = ("../data/output/stacking") if not os.path.exists(out_dir): os.makedirs(out_dir) with open(os.path.join(out_dir, f"params_{score:.5f}.txt"), "w") as fp: print(",".join(feats), file=fp) oof_df = train_df.copy().reset_index() oof_df['target'] = oof_preds oof_df[['card_id', 'target']].to_csv(os.path.join(out_dir, f"oof_{score:.5f}.csv"), index=False) submission = test_df.copy().reset_index() submission['target'] = sub_preds submission[['card_id', 'target']].to_csv(os.path.join(out_dir, f"stacking_{score:.5f}.csv"), index=False) return score
class RegressionImputer(BaseEstimator, RegressorMixin):
    """Custom scikit-learn estimator for imputation with Bayesian regression"""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Fit the underlying BayesianRidge model; returns self."""
        self.regr_ = BayesianRidge()
        self.regr_.fit(X, y)
        return self

    def predict(self, X, y=None):
        """Draw one rounded sample per row from the posterior predictive
        distribution (mean/std from the fitted model)."""
        try:
            getattr(self, "regr_")
        except AttributeError:
            raise RuntimeError("Imputer must be fitted before prediction.")
        # predict measures
        mean, std = self.regr_.predict(X, return_std=True)
        posterior = pd.DataFrame({"pred_mean": mean, "pred_std": std})
        draws = posterior.apply(
            lambda row: np.random.normal(row.pred_mean, row.pred_std),
            axis=1)
        return draws.round()
def fnBayesianRidge(self, year, avgTemp, predictYear):
    """Fit BayesianRidge on (year -> avgTemp) and return the held-out R^2
    together with the prediction for `predictYear`."""
    x_fit, x_holdout, y_fit, y_holdout = train_test_split(
        year, avgTemp, test_size=0.1, random_state=42)
    model = BayesianRidge(compute_score=True)
    # Reshape the 1-D year vector into a single-feature column.
    model.fit(x_fit[:, np.newaxis], y_fit)
    r2 = model.score(x_holdout[:, np.newaxis], y_holdout)
    return (r2, model.predict(predictYear))
def bayes_ridge_reg(self):
    """Fit BayesianRidge on (self.x_data, self.y_data), print the fitted
    parameters and in-sample accuracy, and return the integer-rounded
    in-sample predictions as a list."""
    br = BayesianRidge()
    br.fit(self.x_data, self.y_data)
    adjusted_result = br.predict(self.x_data)
    # BUG FIX: the original used Python 2 print statements, which are a
    # syntax error under Python 3 — the dialect the rest of this file uses.
    print("bayes ridge params", br.coef_, br.intercept_)
    print("bayes ridge accuracy", get_accuracy(adjusted_result, self.y_data))
    # List comprehension keeps the Python 2 behaviour of map() returning a
    # list (Python 3's map() is lazy).
    return [int(v) for v in adjusted_result]
def bayes(self):
    """Fit BayesianRidge on the stored train split and return the metrics
    dict produced by set_metrics for the test split."""
    clf = BayesianRidge(compute_score=True)
    clf.fit(self.X_train, self.y_train)
    y_pred = clf.predict(self.X_test)
    # FIX: the local was named `dict`, shadowing the builtin.
    metrics = {}
    set_metrics(y_pred, self.y_test, metrics)
    return metrics
def ridreg(df, test):
    """Fit BayesianRidge on df[['time', 'temp']] vs df['count'] and write
    predictions for `test` to data/submission3.csv.

    Parameters
    ----------
    df : DataFrame
        Training data with 'count', 'time' and 'temp' columns.
    test : DataFrame
        Data to predict; must have 'time' and 'temp' columns.
    """
    clf = BayesianRidge()
    target = df['count']
    train = df[['time', 'temp']]
    # BUG FIX: the original read the global `test2` here, silently ignoring
    # the `test` parameter passed by callers.
    test = test[['time', 'temp']]
    clf.fit(train, target)
    print(test.head(3))
    # Coerce every row to a list of floats before predicting.
    final = [[float(x) for x in row] for row in test.values]
    predicted_probs = clf.predict(final)
    # (Removed a dead read of data/test.csv whose result was never used.)
    predicted_probs = pd.DataFrame(predicted_probs)
    print(predicted_probs.head(3))
    predicted_probs.to_csv('data/submission3.csv', index=False)
def impute_regression(data, target, n=1):
    """
    Perform multiple imputation by drawing from posterior distribution of
    Bayesian ridge regression model.

    Parameters
    ----------
    data : DataFrame
        Data to use for imputation.
    target : str
        Column to impute.
    n : int
        Number of multiple imputations to perform.

    Returns
    -------
    out : DataFrame
        One column per imputation, indices of original rows.
    """
    # Train only on rows without any missing values.
    complete = data.dropna()
    model = BayesianRidge()
    model.fit(complete.drop(target, axis=1), complete[target])
    # Posterior predictive mean/std for the rows missing `target`.
    missing_rows = data[data[target].isnull()].drop(target, axis=1)
    preds, stds = model.predict(missing_rows, return_std=True)
    # Sample n imputations from the predictive distribution.
    return samplePredictions_df(preds, stds, n=n, name="imputation")
def train(self, X_train, y_train):
    """Train the stacking meta-model with a 5x2 repeated K-fold of
    BayesianRidge models, persisting each fold's model to disk and storing
    the per-sample results in self.train_result.

    X_train is a matrix (indexable by row arrays); y_train is a DataFrame.
    """
    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(X_train.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(X_train,
                                                                 y_train)):
        print("fold {}".format(fold_))
        trn_data, trn_y = X_train[trn_idx], y_train.iloc[trn_idx].values
        val_data, val_y = X_train[val_idx], y_train.iloc[val_idx].values
        clf = BayesianRidge()
        clf.fit(trn_data, trn_y)
        oof_stack[val_idx] = clf.predict(val_data)
        # Persist each fold's fitted model.
        joblib.dump(clf, 'stack_model' + str(fold_) + '.pkl')
    # Competition-style score: half the out-of-fold MSE.
    print("stack score:{:<8.8f}".format(
        mean_squared_error(y_train.values, oof_stack) / 2))
    self.train_result = pd.DataFrame({
        'real': y_train.values,
        'pred': oof_stack,
        'error': (y_train.values - oof_stack)**2
    })
class BayesianRidgeRegression(skl.base.BaseEstimator, skl.base.TransformerMixin):
    """Thin sklearn-style wrapper around linear_model.BayesianRidge with a
    configurable iteration count and an optional save path."""

    def __init__(self, n_iter=300, save_path=None):
        super(BayesianRidgeRegression, self).__init__()
        self.save_path = save_path  # used by external persistence code
        self.n_iter = n_iter
        self.model = None

    def fit(self, X, y):
        """Fit a fresh BayesianRidge on (X, y); returns self."""
        self.model = BayesianRidge(n_iter=self.n_iter, fit_intercept=True)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        """Validate X and return the wrapped model's predictions."""
        checked = check_array(X)
        result = self.model.predict(checked)
        print("BayesianRidge predicted")
        return result

    def score(self, X, y, sample_weight=None):
        """Return the negative mean squared error (higher is better)."""
        residual_terms = (self.predict(X) - y)**2 / len(y)
        return -np.sum(residual_terms)

    def set_save_path(self, save_path):
        self.save_path = save_path
def ridreg(df, test):
    """Fit BayesianRidge on df[['time', 'temp']] vs df['count'] and write
    predictions for `test` to data/submission3.csv.

    NOTE(review): this duplicates an earlier `ridreg` definition in this
    file; the later definition wins at import time — consider deleting one.
    """
    clf = BayesianRidge()
    target = df['count']
    train = df[['time', 'temp']]
    # BUG FIX: the original read the global `test2` here, silently ignoring
    # the `test` parameter passed by callers.
    test = test[['time', 'temp']]
    clf.fit(train, target)
    print(test.head(3))
    # Coerce every row to a list of floats before predicting.
    final = [[float(x) for x in row] for row in test.values]
    predicted_probs = clf.predict(final)
    # (Removed a dead read of data/test.csv whose result was never used.)
    predicted_probs = pd.DataFrame(predicted_probs)
    print(predicted_probs.head(3))
    predicted_probs.to_csv('data/submission3.csv', index=False)
def test_save_load():
    """Round-trip test: fit BayesianRidge, save it through the generic model
    wrapper, reload it, and check predictions match; then materialize the
    input as a DataFrameDirectory (parquet + _meta.yaml) for downstream use.
    """
    clf = BayesianRidge(compute_score=True)
    X, y = get_traininig_data()
    clf.fit(X, y)
    y_hat = clf.predict(X)
    model = MyCustomModel(clf)
    # NOTE(review): the pip package is published as "scikit-learn"; the
    # "sklearn" name is a deprecated alias — confirm the target environment
    # still resolves it.
    conda = {
        "name": "test",
        "channels": ["defaults"],
        "dependencies": [{"pip": ["scipy", "sklearn"]}]
    }
    model_save_path = os.path.join(dirname(abspath(__file__)), "AzureMLModel")
    local_dependencies = [dirname(abspath(__file__))]
    save_generic_model(model, path=model_save_path, conda=conda,
                       local_dependencies=local_dependencies)
    # Column names must be strings for the generic predict interface.
    df = pd.DataFrame(data=X)
    df.columns = df.columns.astype(str)
    loaded_generic_model = load_generic_model(model_save_path)
    result_df = loaded_generic_model.predict(df)
    # Reloaded model must reproduce the in-memory predictions exactly.
    assert (result_df.to_numpy() == y_hat.reshape(-1, 1)).all()
    dfd_path = os.path.join(dirname((abspath(__file__))), "dfd")
    os.makedirs(dfd_path, exist_ok=True)
    data_save_path = os.path.join(dfd_path, "data.dataset.parquet")
    df.to_parquet(data_save_path, engine="pyarrow")
    meta_path = os.path.join(dfd_path, "_meta.yaml")
    with open(meta_path, "w") as fp:
        # NOTE(review): the literal "{}" is written as-is (no .format call)
        # — presumably an empty YAML mapping for `extension`; confirm the
        # DataFrameDirectory schema expects that.
        fp.write("type: DataFrameDirectory\nextension: {}\nformat: Parquet\ndata: data.dataset.parquet")
def test_return_std():
    """Check that predict(..., return_std=True) recovers the injected noise
    level for both Bayesian regressors (BayesianRidge and ARDRegression)."""

    def clean_target(X):
        return np.dot(X, w) + b

    def noisy_target(X, noise_mult):
        return clean_target(X) + np.random.randn(X.shape[0]) * noise_mult

    d = 5
    n_train = 50
    n_test = 10
    w = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    b = 1.0
    X = np.random.random((n_train, d))
    X_test = np.random.random((n_test, d))
    # Smaller noise should be recovered to more decimal places.
    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = noisy_target(X, noise_mult)
        for model in (BayesianRidge(), ARDRegression()):
            model.fit(X, y)
            _, y_std = model.predict(X_test, return_std=True)
            assert_array_almost_equal(y_std, noise_mult, decimal=decimal)
class BAYESIANRIDGE():
    """Wrapper giving BayesianRidge the same fit/predict interface as the
    other model wrappers in this module.

    Parameters
    ----------
    BayesianRidge : class
        The estimator class, injected by the caller.
    N : int
        Divisor for the available CPU cores (kept for interface parity).
    """

    def __init__(self, BayesianRidge, N):
        self.cores_number = int(np.ceil(multiprocessing.cpu_count()/N))
        self.model = BayesianRidge(
            alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
            fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
            normalize=False, tol=0.001, verbose=False)
        # BayesianRidge is single-threaded, hence no core count to report.
        print("BayesianRidge Cores: ", np.nan)

    def fit(self, X_train, y_train, X_test, y_test, error_type = "MAE"):
        """Fit the wrapped model.

        X_test, y_test and error_type are accepted only for interface
        compatibility with the other wrappers. (FIX: removed the original
        dead code that mapped error_type to a metric name and discarded it.)
        """
        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Return the wrapped model's predictions for X_test."""
        prediction = self.model.predict(X_test)
        return(prediction)
def BayesianRidgeRegression(data, label, pred_data, pred_last):
    '''
    Fit BayesianRidge and plain LinearRegression on (data, label), print
    train/prediction R^2 scores and mismatch counts against `pred_last`,
    and return the OLS predictions.

    (Original author's note: works poorly / "效果很差".)
    '''
    data = np.array(data)
    pred_data = np.array(pred_data)
    label = np.array(label)
    pred_last = np.array(pred_last)
    from sklearn.linear_model import BayesianRidge, LinearRegression
    clf = BayesianRidge(compute_score=True)
    clf.fit(data, label)
    # BUG FIX: Python 2 print statements made this file fail to parse under
    # Python 3, which the surrounding code targets.
    print(clf.score(data, label))
    pred_result = clf.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(clf.score(pred_data, pred_last))
    ols = LinearRegression()
    ols.fit(data, label)
    print(ols.score(data, label))
    pred_result = ols.predict(pred_data)
    print("Number of mislabeled points out of a total %d points : %d" %
          (pred_data.shape[0], (pred_last != pred_result).sum()))
    print(ols.score(pred_data, pred_last))
    return pred_result
def BayesianRidgeRegression(output, features, labels): X = features.values #.reshape(-1,1) # features y = labels.values #.reshape(-1,1) # labels X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) regressor = BayesianRidge() regressor.fit(X_train, y_train) #training the algorithm y_pred = regressor.predict(X_test) #predicting # visualiation of 10 vessels (predicted vs actual value) lr = np.round( pd.DataFrame({ 'Actual': y_test.flatten(), 'Predicted': y_pred.flatten() })) lr = lr.head(10) lr.plot(kind='bar', figsize=(10, 6)) plt.grid(which='major', linestyle='-', linewidth='0.5', color='green') plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black') plt.title( 'Actual vs Predicted {0} (10 vessels), \n BayesianRidge Regression'. format(output), fontsize=12) plt.show() #evaluation metrics lrEvaluation = evaluation('BayesianRidge Regression {0}'.format(output), y_pred, y_test) return (lrEvaluation, lr)
def Model_stack(df_train_x, df_train_y, df_test):
    """Two-level stacking: fit six base regressors (Lasso, LightGBM,
    RandomForest, SVR, LinearRegression, BayesianRidge) on 40% of the
    training data, use their holdout predictions as meta-features for a
    RidgeCV meta-model, and write the final test predictions to
    predict_drop+3.txt.

    NOTE(review): the stack column names do not match their models — 'rf'
    holds Lasso predictions, 'adaboost' LightGBM, 'gbdt' RandomForest,
    'lightgbm' SVR. Harmless (the meta-model only sees values) but
    confusing; confirm before renaming since string keys must match between
    `stack` and `stack_test`.
    """
    # SVR kernel options: 'linear'/'poly'/'rbf'/'sigmoid'/'precomputed' or a
    # callable ('rbf' is the default; 'precomputed' expects a kernel matrix).
    svr_ = SVR(kernel='linear', degree=3, coef0=0.0, tol=0.001, C=1.0,
               epsilon=0.1, shrinking=True, cache_size=20)
    # NOTE(review): `n_estimator=10` is presumably a typo for `n_estimators`
    # — LGBMModel absorbs unknown names into **kwargs, so it may be silently
    # ignored; confirm against the lightgbm version in use.
    lgb_ = lgb.LGBMModel(boosting_type='gbdt', num_leaves=35, max_depth=20,
                         max_bin=255, learning_rate=0.03, n_estimator=10,
                         subsample_for_bin=2000, objective='regression',
                         min_split_gain=0.0, min_child_weight=0.001,
                         min_child_samples=20, subsample=1.0, verbose=0,
                         subsample_freq=1, colsample_bytree=1.0,
                         reg_alpha=0.0, reg_lambda=0.0, random_state=None,
                         n_jobs=-1, silent=True)
    RF_model = RandomForestRegressor(n_estimators=50, max_depth=25,
                                     min_samples_split=20,
                                     min_samples_leaf=10,
                                     max_features='sqrt', oob_score=True,
                                     random_state=10)
    # Bayesian ridge regression (all-default hyperparameters spelled out,
    # except the tighter tol).
    BR_model = BayesianRidge(alpha_1=1e-06, alpha_2=1e-06,
                             compute_score=False, copy_X=True,
                             fit_intercept=True, lambda_1=1e-06,
                             lambda_2=1e-06, n_iter=300, normalize=False,
                             tol=0.0000001, verbose=False)
    linear_model = LinearRegression()
    ls = Lasso(alpha=0.00375)
    # 40% of the data trains the base models; the remaining 60% produces the
    # meta-features the RidgeCV meta-model is fit on.
    x_train, x_test, y_train, y_test = train_test_split(df_train_x,
                                                        df_train_y,
                                                        test_size=0.6)
    rg = RidgeCV(cv=5)
    stack = pd.DataFrame()
    stack_test = pd.DataFrame()
    ls.fit(x_train, y_train)
    lgb_.fit(x_train, y_train)
    RF_model.fit(x_train, y_train)
    svr_.fit(x_train, y_train)
    linear_model.fit(x_train, y_train)
    BR_model.fit(x_train, y_train)
    # Meta-features on the holdout split (column names mislabeled, see note).
    stack['rf'] = ls.predict(x_test)
    stack['adaboost'] = lgb_.predict(x_test)
    stack['gbdt'] = RF_model.predict(x_test)
    stack['lightgbm'] = svr_.predict(x_test)
    stack['linear_model'] = linear_model.predict(x_test)
    stack['BR'] = BR_model.predict(x_test)
    rg.fit(stack, y_test)
    # Same meta-features computed on the real test set.
    stack_test['rf'] = ls.predict(df_test)
    stack_test['adaboost'] = lgb_.predict(df_test)
    stack_test['gbdt'] = RF_model.predict(df_test)
    stack_test['lightgbm'] = svr_.predict(df_test)
    stack_test['linear_model'] = linear_model.predict(df_test)
    stack_test['BR'] = BR_model.predict(df_test)
    final_ans = rg.predict(stack_test)
    pd.DataFrame(final_ans).to_csv('predict_drop+3.txt', index=False,
                                   header=False)
def get_opti_temp(crop_list=["Onions", "Tomatoes"]):
    """Fit one BayesianRidge loss model per crop from MongoDB sensor corpora,
    then grid-search temperature (9-39 C) x humidity (60-99 %) for the pair
    minimising the average predicted loss.

    Returns
    -------
    tuple
        (best_temperature, best_humidity).

    NOTE(review): `crop_list` is accepted but unused — the two crops are
    hard-coded in the queries below; it is also a mutable default argument,
    which is best avoided.
    """
    clft = BayesianRidge()
    clfo = BayesianRidge()
    client = MongoClient()
    db = client.server_db
    cursor_tomato = db.corpusoptitemp.find({"Datatype": "Tomatoes"})
    cursor_onion = db.corpusoptitemp.find({"Datatype": "Onions"})
    xt = []
    yt = []
    xo = []
    yo = []
    # Features are (temperature, humidity); target is the recorded loss.
    for docs in cursor_tomato:
        xt.append([docs["Temperature"], docs["Humidity"]])
        yt.append(docs["Losses"])
    for docs in cursor_onion:
        xo.append([docs["Temperature"], docs["Humidity"]])
        yo.append(docs["Losses"])
    clft.fit(xt, yt)
    clfo.fit(xo, yo)
    final = []
    # Exhaustive grid over the feasible storage conditions.
    for temp in range(9, 40):
        for hum in range(60, 100):
            a = clft.predict([[temp, hum]])
            b = clfo.predict([[temp, hum]])
            final.append([(a[0] + b[0]) / 2, temp, hum])
    # Ascending by average predicted loss: final[0] is the optimum.
    final.sort(key=lambda x: x[0])
    # BUG FIX: converted the Python 2 print statement to a Python 3 call,
    # matching the rest of the file.
    print("Store the produce at ", final[0][1], "C and at ", final[0][2],
          "% relative humidity.")
    return final[0][1], final[0][2]
def bayesRegr(source, target):
    """Train BayesianRidge on `source` (all columns except the last are
    features, the last is the label) and return predictions for the feature
    columns of `target`."""
    clf = BayesianRidge()
    feature_cols = source.columns[:-1]
    labels = source[source.columns[-1]]
    clf.fit(source[feature_cols], labels)
    return clf.predict(target[target.columns[:-1]])
def stacking_model(oof_list, prediction_list, labels, sample_ids):
    """BayesianRidge stacking over base-model predictions; prints OOF and
    cross-validated MSE and writes the averaged submission to
    submittion_tree.csv (sample_id, rate).
    """
    # Column-wise meta-features: one column per base model.
    train_stack = np.vstack(oof_list).transpose()
    test_stack = np.vstack(prediction_list).transpose()
    kfolder = RepeatedKFold(n_splits=5, n_repeats=2, random_state=666)
    kfold = kfolder.split(train_stack, labels)
    preds_list = list()
    stacking_oof = np.zeros(train_stack.shape[0])
    for train_index, vali_index in kfold:
        k_x_train = train_stack[train_index]
        k_y_train = labels.loc[train_index]
        k_x_vali = train_stack[vali_index]
        gbm = BayesianRidge(normalize=True)
        gbm.fit(k_x_train, k_y_train)
        k_pred = gbm.predict(k_x_vali)
        stacking_oof[vali_index] = k_pred
        # Keep every fold's test prediction; averaged below.
        preds = gbm.predict(test_stack)
        preds_list.append(preds)
    fold_mse_error = mean_squared_error(labels, stacking_oof)
    print(f'stacking fold mse error is {fold_mse_error}')
    # Independent sanity check with sklearn's own cross-validation.
    mse = make_scorer(mean_squared_error)
    gbm = BayesianRidge()
    cv_mse_error = cross_val_score(gbm, train_stack, labels, scoring=mse,
                                   cv=5, n_jobs=5)
    cv_mse_error = np.mean(cv_mse_error)
    print(f'stacking cv mse error is {cv_mse_error}')
    # 10 columns = 5 splits x 2 repeats.
    preds_columns = ['preds_{id}'.format(id=i) for i in range(10)]
    preds_df = pd.DataFrame(data=preds_list)
    preds_df = preds_df.T
    preds_df.columns = preds_columns
    preds_list = list(preds_df.mean(axis=1))
    sub_df = pd.DataFrame({'sample_id': sample_ids, 'rate': preds_list})
    sub_df.to_csv('submittion_tree.csv', index=False, header=False)
def fit_polynomial_bayesian_skl(X, Y, degree,
                                lambda_shape=1.e-6, lambda_invscale=1.e-6,
                                padding=10, n=100,
                                X_unknown=None):
    """Fit a degree-`degree` polynomial to (X, Y) via BayesianRidge on a
    Vandermonde expansion.

    Returns a list: [coefficients, ((x, y_predict), (x, y_dot)) over a
    padded linspace, and — if X_unknown is given — the same pair evaluated
    at X_unknown]. Each pair contains the model's own predict() output and
    the manual np.dot(vandermonde, coeff) evaluation for comparison.
    """
    X_v = pol.polyvander(X, degree)
    clf = BayesianRidge(lambda_1=lambda_shape, lambda_2=lambda_invscale)
    clf.fit(X_v, Y)
    coeff = np.copy(clf.coef_)
    # there some weird intercept thing
    # since the Vandermonde matrix has 1 at the beginning, just add this
    # intercept to the first coeff
    coeff[0] += clf.intercept_
    ret_ = [coeff]
    # generate the line over the data range, extended by `padding`
    x = np.linspace(X.min()-padding, X.max()+padding, n)
    x_v = pol.polyvander(x, degree)
    # using the provided predict method
    y_1 = clf.predict(x_v)
    # using np.dot() with coeff
    y_2 = np.dot(x_v, coeff)
    ret_.append(((x, y_1), (x, y_2)))
    if X_unknown is not None:
        xu_v = pol.polyvander(X_unknown, degree)
        # using the predict method
        yu_1 = clf.predict(xu_v)
        # using np.dot() with coeff
        yu_2 = np.dot(xu_v, coeff)
        ret_.append(((X_unknown, yu_1), (X_unknown, yu_2)))
    return ret_
def fit_model_10(self,toWrite=False):
    """Fit a BayesianRidge on each CV split in self.cv_data, printing the
    logloss per split; optionally pickle the last fitted model.

    :param toWrite: when True, dump the (last) fitted model to model10/model.pkl
    """
    model = BayesianRidge(n_iter=5000)
    for data in self.cv_data:
        X_train, X_test, Y_train, Y_test = data
        model.fit(X_train,Y_train)
        pred = model.predict(X_test)
        print("Model 10 score %f" % (logloss(Y_test,pred),))
    if toWrite:
        # FIX: pickle requires a binary-mode handle ('wb'); text mode corrupts
        # the stream on Python 3. The context manager also guarantees the file
        # is closed even if dump() raises.
        with open('model10/model.pkl','wb') as f2:
            pickle.dump(model,f2)
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :param n_features: feature count, used only to name the output pickle file
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
    # FIX: dropped the unused `ridge_alpha = clf.alpha_` local — its comment
    # claimed a CV-selected alpha, but BayesianRidge does no CV and the value
    # was never used nor pickled.
    # Dump model and metrics in a fixed order; loaders must read them back
    # in the same order.
    with open('../trained_networks/brr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)
    return
def prediction_BayesianRidge (X_train, Y_train, X_test, Y_test,normalize):
    """Fit a Bayesian ridge regression on the training split and summarise it.

    Returns ``(pred_test, coeff_df, pred_train, result)`` where ``coeff_df``
    pairs each feature ("Fact") with its fitted coefficient and ``result`` is
    a dict of shapes, errors and the dominant feature.
    """
    lreg = BayesianRidge(normalize=normalize)
    lreg.fit(X_train, Y_train)

    # One row per feature, with the coefficient the model assigned to it.
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)

    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # Row for the feature with the largest coefficient.
    top_row = coeff_df.iloc[coeff_df["Coefficient"].idxmax()]

    result = {
        "method": "BayesianRidge",
        "normalize": "Y" if normalize else "N",
        "X_train_shape": X_train.shape,
        "Y_train_shape": Y_train.shape,
        "X_test_shape": X_test.shape,
        "Y_test_shape": Y_test.shape,
        "intercept": lreg.intercept_,
        "num_coef": len(lreg.coef_),
        # cf_dict (module-level) maps a fact name to its description.
        "max_fact": cf_dict.loc[top_row["Fact"], "description"],
        "max_fact_value": top_row["Coefficient"],
        "MSE_train": np.mean((Y_train - pred_train) ** 2),
        "MSE_test": np.mean((Y_test - pred_test) ** 2),
        "variance": lreg.score(X_test, Y_test),
    }
    return pred_test, coeff_df, pred_train, result
def sale(data):
    # Shift by 1 before the log so zero sales map to log(1) == 0.
    data = int(data) + 1
    return log(data)

# Python 2 script: train a BayesianRidge on log-transformed sales and write
# a submission CSV with the predictions mapped back via exp(x) - 1.
dataset = pandas.read_csv("input/train2_.csv")
testset = pandas.read_csv("input/test2_.csv")
dataset['Sale'] = dataset['Sales'].apply(sale)
labelData = dataset['Sale'].values
myId = testset['Id'].values
testset.drop(['Id'], inplace=True, axis=1)
testData = testset.iloc[:, :].values
# Remove the target columns so only features remain for training.
dataset.drop(['Sales', 'Sale'], inplace=True, axis=1)
dataData = dataset.iloc[:, :].values
BRModel = BayesianRidge(compute_score=True)
BRModel.fit(dataset.iloc[:, :].values, labelData)
# Pair each test Id with the inverse-transformed prediction.
preds = numpy.column_stack((myId, BRModel.predict(testData))).tolist()
preds = [[int(i[0])] + [exp(float(i[1])) - 1] for i in preds]
print BRModel.scores_
with open("result/sub_BayesRidge.csv", "w") as output:
    writer = csv.writer(output, lineterminator='\n')
    writer.writerow(["Id", "Sales"])
    writer.writerows(preds)
def main():
    """Fit a post-hoc model (BayesianRidge for regression, LogisticRegression
    otherwise) on learned sequence representations plus optional additional
    features, then write accuracy metrics, plots and coefficients.

    Python 2 script (uses `print >>` redirection and optparse).
    """
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide full data HDF5, representation HDF5, and target index or filename')
    else:
        repr_hdf5_file = args[0]
        data_hdf5_file = args[1]
        # NOTE(review): target_i stays a string here but is used as an index
        # below — presumably upstream callers pass an int-like value; confirm.
        target_i = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################
    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:,target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:,target_i]

    # load training representations (skipped when only additional features are used)
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []
    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        # optionally keep only features whose label matches the regexes
        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:,fi], test_a[:,fi], add_labels[fi]

        # append additional features; add_i records the column offset of the
        # additional features within the final feature matrix
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x,train_a), axis=1)
            test_x = np.concatenate((test_x,test_a), axis=1)

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance classes by downsampling negatives (balance() defined elsewhere)
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # optionally subsample the training set
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]

    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy (R^2 on the test split)
        acc_out = open('%s/r2.txt' % options.out_dir, 'w')
        print >> acc_out, model.score(test_x, test_y)
        acc_out.close()

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha':0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y-test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()
    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy (AUC on positive-class probabilities)
        test_preds = model.predict_proba(test_x)[:,1].flatten()
        acc_out = open('%s/auc.txt' % options.out_dir, 'w')
        print >> acc_out, roc_auc_score(test_y, test_preds)
        acc_out.close()

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)
        roc_out = open('%s/roc.txt' % options.out_dir, 'w')
        for i in range(len(fpr)):
            print >> roc_out, '%f\t%f\t%f' % (fpr[i], tpr[i], thresholds[i])
        roc_out.close()

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)
        prc_out = open('%s/prc.txt' % options.out_dir, 'w')
        for i in range(len(precision)):
            print >> prc_out, '%f\t%f' % (precision[i], recall[i])
        prc_out.close()

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table for the additional features only
    coef_out = open('%s/add_coefs.txt' % options.out_dir, 'w')
    for ai in range(len(add_labels)):
        if options.regression:
            coefi = model.coef_[add_i+ai]
        else:
            coefi = model.coef_[0,add_i+ai]
        print >> coef_out, add_labels[ai], coefi
    coef_out.close()
def do_validation(data_path, steps=10):
    """Compare many regressors/classifiers by mean ROC-AUC over `steps` drivers.

    For each driver: build its dataset, shuffle, split 100 train / 300 test
    rows, fit each model and accumulate AUC. Tree models use all features;
    linear models drop a set of correlated columns first. Python 2 script.
    """
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    # NOTE(review): SVR accepted a probability kwarg only in very old
    # scikit-learn releases; modern versions raise on it — confirm version.
    svr = SVR(kernel="linear", probability=True)
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)

    # Running AUC totals, one per model; divided by `steps` at the end.
    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    # RBM features piped into logistic regression ("neural network" entry).
    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in xrange(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))
        train = df[:100]
        label = train['label']
        del train['label']
        test = df[100:400]
        Y = test['label']
        del test['label']

        # (a long commented-out to_drop list of speed1..speed80 columns removed)
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr)

        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)

        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)

        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)

        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models: also drop strongly-related derived features.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed', 'sd_avg_speed', 'mean_inst_speed', 'points']

        X_train = train.drop(to_drop, 1)
        X_test = test.drop(to_drop, 1)

        logit.fit(X_train, label)
        # Positive-class probability for AUC.
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)

        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics/steps
    print "AdaBoost:", ada_metrics/steps
    print "Extra Trees:", etree_metrics/steps
    print "RF:", rf_metrics/steps
    print "KN:", kn_metrics/steps
    print ""
    print "Logit:", logit_metrics/steps
    print "SVR:", svr_metrics/steps
    print "Ridge:", ridge_metrics/steps
    print "BayesianRidge:", bridge_metrics/steps
    print "Elastic Net:", enet_metrics/steps
    print "Neural Networks:", nnet_metrics/steps
    print ""
# Hold out the first 100 trials for testing; train on the remainder.
# counts/countrates/times (and traininghists/testhists/bins used below) are
# defined earlier in the file — presumably histogram features per trial; verify.
trainingcounts = counts[100:]
testcounts = counts[:100]
trainingrates = countrates[100:]
testrates = countrates[:100]
trainingtimes = times[100:]
testtimes = times[:100]
# using trainingcounts and training hists use log linear
#poisson_model = sm.GLM(trainingrates,
#                       sm.tools.tools.add_constant(traininghists),
#                       family =sm.families.Poisson(sm.genmod.families.links.log))
#results = poisson_model.fit()
#print(results.summary())
#x = results.predict(sm.tools.tools.add_constant(testhists))
# Linear Bayesian ridge replaces the commented-out Poisson GLM above.
clf = BayesianRidge(compute_score=True)
clf.fit(traininghists,trainingrates)
x = clf.predict(testhists)
answer = testrates
# Overlay predicted vs. actual rates against the bin centers.
plt.plot(bins,x)
plt.plot(bins,answer)
plt.show()
def bayes_ridge_reg(x_data,y_data):
    """Fit a BayesianRidge on (x_data, y_data) and return its predictions on
    the same training inputs, truncated to ints.

    Python 2 code: `print` statement; `map` returns a list here.
    """
    br = BayesianRidge()
    br.fit(x_data,y_data)
    print 'br params',br.coef_,br.intercept_
    adjusted_result = br.predict(x_data)
    # int() truncates toward zero, so this rounds predictions down in magnitude.
    return map(int,list(adjusted_result))
# print("-----------------------------------------------") # X_train, X_test = selectFeatures(X_train, X_test, y_train, k) print("-----------------------------------------------") print("SVM Classification of training set") print("-----------------------------------------------") class_weight = {0:5} print("Class weight=", class_weight) clf = BayesianRidge(compute_score=True).fit(X_train, y_train) print("Test svm.SVC score=", clf.score(X_test, y_test)) print("Train svm.SVC score=", clf.score(X_train, y_train)) print("-----------------------------------------------") print("Metrics on TEST SET") print("-----------------------------------------------") y_pred = clf.predict(X_test) print(metrics.classification_report(y_test, y_pred, target_names=label_names)) print(metrics.confusion_matrix(y_test, y_pred)) print("-----------------------------------------------") print("Metrics on TRAIN SET") print("-----------------------------------------------") y_predTrain = clf.predict(X_train) print(metrics.classification_report(y_train, y_predTrain, target_names=label_names)) print(metrics.confusion_matrix(y_train, y_predTrain)) #met.crossValidationScores(clf, X_train, y_train) # met.showRocAnalysis(X_bns, Y)
# Convergence trace of the previously-fitted model (clf and lw defined above).
plt.plot(clf.scores_, color='navy', linewidth=lw)
plt.ylabel("Score")
plt.xlabel("Iterations")


# Plotting some predictions for polynomial regression
def f(x, noise_amount):
    """Noisy target: sqrt(x)*sin(x) plus Gaussian noise scaled by noise_amount."""
    y = np.sqrt(x) * np.sin(x)
    noise = np.random.normal(0, 1, len(x))
    return y + noise_amount * noise


degree = 10
X = np.linspace(0, 10, 100)
y = f(X, noise_amount=0.1)
# Fit on a degree-10 Vandermonde expansion of X.
clf_poly = BayesianRidge()
clf_poly.fit(np.vander(X, degree), y)

# Evaluate slightly beyond the training range to show extrapolation behavior.
X_plot = np.linspace(0, 11, 25)
y_plot = f(X_plot, noise_amount=0)
# return_std=True yields the posterior predictive std for the error bars.
y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree), return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(X_plot, y_mean, y_std, color='navy', label="Polynomial Bayesian Ridge Regression", linewidth=lw)
plt.plot(X_plot, y_plot, color='gold', linewidth=lw, label="Ground Truth")
plt.ylabel("Output y")
plt.xlabel("Feature X")
plt.legend(loc="lower left")
plt.show()
# Polynomial design matrices (columns 1, x, x^2, ... x^n_order); x_train,
# x_test, y_train, func and n_order are defined earlier in the file.
X_train = np.vander(x_train, n_order + 1, increasing=True)
X_test = np.vander(x_test, n_order + 1, increasing=True)

# #############################################################################
# Plot the true and predicted curves with log marginal likelihood (L)
# fit_intercept=False because the Vandermonde matrix already has a ones column.
reg = BayesianRidge(tol=1e-6, fit_intercept=False, compute_score=True)
fig, axes = plt.subplots(1, 2, figsize=(8, 4))
for i, ax in enumerate(axes):
    # Bayesian ridge regression with different initial value pairs
    if i == 0:
        init = [1 / np.var(y_train), 1.]  # Default values
    elif i == 1:
        init = [1., 1e-3]
    reg.set_params(alpha_init=init[0], lambda_init=init[1])
    reg.fit(X_train, y_train)
    # Posterior predictive mean and std for the uncertainty band.
    ymean, ystd = reg.predict(X_test, return_std=True)

    ax.plot(x_test, func(x_test), color="blue", label="sin($2\\pi x$)")
    ax.scatter(x_train, y_train, s=50, alpha=0.5, label="observation")
    ax.plot(x_test, ymean, color="red", label="predict mean")
    ax.fill_between(x_test, ymean-ystd, ymean+ystd, color="pink", alpha=0.5, label="predict std")
    ax.set_ylim(-1.3, 1.3)
    ax.legend()
    title = "$\\alpha$_init$={:.2f},\\ \\lambda$_init$={}$".format(
        init[0], init[1])
    if i == 0:
        title += " (Default)"
    ax.set_title(title, fontsize=12)
    # scores_[-1] is the final log marginal likelihood from compute_score=True.
    text = "$\\alpha={:.1f}$\n$\\lambda={:.3f}$\n$L={:.1f}$".format(
        reg.alpha_, reg.lambda_, reg.scores_[-1])
# Fit several regressors on the same scaled features and compare test MAE.
# x_train_scaled, y_train, x_test_scaled, y_test and y_predicted (linear
# regression baseline) are defined earlier in the file. Python 2 prints.
kr = KernelRidge(alpha=0.0001, coef0=1, degree=1, gamma=0.001, kernel='rbf',kernel_params=None)
kr.fit(x_train_scaled, y_train)
y3 = kr.predict(x_test_scaled)

lasso = Lasso(alpha=1e-09)
lasso.fit(x_train_scaled, y_train)
y4 = lasso.predict(x_test_scaled)

linear_ridge = Ridge(alpha=0.1)
linear_ridge.fit(x_train_scaled,y_train)
y5 = linear_ridge.predict(x_test_scaled)

bayesian_ridge = BayesianRidge(alpha_1=1e-05, alpha_2=10, lambda_1=10, lambda_2=1e-05)
bayesian_ridge.fit(x_train_scaled, y_train)
y6 = bayesian_ridge.predict(x_test_scaled)

sgd = SGDRegressor(alpha=0.1, epsilon=0.001, l1_ratio=0.2, loss='squared_loss', penalty='none', power_t=0.2)
sgd.fit(x_train_scaled, y_train)
y7 = sgd.predict(x_test_scaled)

###########################################
print '########## TESTING ERRORS ##########'
print "MAE for Linear Regression:", mean_absolute_error(y_test, y_predicted)
print "MAE for SVR:", mean_absolute_error(y_test, y2)
print "MAE for Kernel Ridge Regression:", mean_absolute_error(y_test, y3)
print "MAE for Lasso Regression:", mean_absolute_error(y_test, y4)
print "MAE for Linear Ridge Regression:", mean_absolute_error(y_test, y5)
print "MAE for Bayesian Ridge Regression:", mean_absolute_error(y_test, y6)
print "MAE for Stochastic Gradient Descent Regression:", mean_absolute_error(y_test, y7)
# Ensemble of 10 stacked runs: refit the latent-feature pipeline each time
# (get_latent_matrix, x, y, x_test and runs are defined earlier in the file)
# and average the per-run predictions, clipped to [0, 5].
for _ in range(10):
    train_latent_matrix = get_latent_matrix(x,y,x)
    test_latent_matrix = get_latent_matrix(x,y,x_test)

    # Clean out rows with NaN.
    #mask = ~np.any(np.isnan(train_latent_matrix), axis=1)
    #newx = train_latent_matrix[mask]
    #newy = y[mask]
    # Replace NaNs with zeros instead of dropping rows (keeps alignment with y).
    newx = np.nan_to_num(train_latent_matrix)
    newy = y

    #last_layer = SVR(kernel='rbf', C=1e3, gamma=0.1)
    last_layer = BayesianRidge()
    last_layer.fit(newx, newy)
    output = last_layer.predict(test_latent_matrix)
    assert len(output) == 8500
    runs.append(output)

# FIX: the output file was opened without ever being closed; the context
# manager guarantees the buffer is flushed and the handle released.
with open('modelz.10.output', 'w') as fout:
    for line in zip(*runs):
        # Average across runs, clipped to the valid score range [0, 5];
        # str(avg)[:6] truncates to at most 6 characters, as before.
        avg = sum(line)/len(line)
        if avg > 5:
            avg = 5.0
        elif avg < 0:
            avg = 0.0
        fout.write(str(avg)[:6]+'\n')
io.imsave("/Users/qcaudron/Desktop/charo/2_smoothed.jpg", ski.img_as_uint(surf)) # <codecell> z1 = np.mean(surf, axis=0) z2 = np.mean(surf, axis=1) #for i in range(surf.shape[1]) : # plt.plot(surf[:, i], "k") #plt.plot(z2) r = [BayesianRidge().fit(np.vander(np.arange(surf.shape[i]), 2), np.mean(surf, axis = 1-i)) for i in [0, 1]] r1 = BayesianRidge().fit(np.arange(len(z1)).reshape(len(z1),1), z1) r2 = BayesianRidge().fit(np.arange(len(z2[500:-500])).reshape(len(z2[500:-500]),1), z2[500:-500]) #plt.plot(r1.predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=5) plt.plot(r2.predict(np.arange(len(z2)).reshape(len(z2),1)), linewidth=5) plt.plot(z2, linewidth=5) #plt.axhline(b[np.argmax(h)], c="r", linewidth=3) #plt.plot(r[0].predict(np.vander(np.arange(surf.shape[0]), 2)), linewidth=3) #plt.plot(r[0].predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=3) #plt.plot(r[0].predict(np.expand_dims(np.arange(surf.shape[0]), axis=1)), linewidth=5) #plt.axhline(np.mean(z1 / r1.predict(np.arange(len(z1)).reshape(len(z1),1)))) # <codecell> lz = np.log(z2) r3 = BayesianRidge().fit(np.arange(len(lz[500:-500])).reshape(len(lz[500:-500]),1), lz[500:-500]) plt.plot(np.exp(lz)) plt.plot(np.exp(r3.predict(np.arange(len(lz)).reshape(len(lz),1))))
def main():
    """Predict tweet scores from averaged word embeddings with BayesianRidge.

    With --cv: 10-fold CV on the training file, printing per-tweet OOF
    predictions to stdout and fold cosine similarities to stderr.
    Without --cv: fit on all training data and print test predictions.
    Python 2 script (`print` statements, `print >>sys.stderr`).
    """
    parser = argparse.ArgumentParser(description="""Creates embeddings predictions.""")
    parser.add_argument('--train')
    parser.add_argument('--test')
    parser.add_argument('--embeddings')
    parser.add_argument('--cv',default=False)
    args = parser.parse_args()

    stoplist = stopwords.words("english")
    # Extend with contractions that the NLTK stoplist misses.
    stoplist.extend("it's 've 's i'm he's she's you're we're they're i'll you'll he'll ".split(" "))

    # word -> embedding vector, loaded from a whitespace-separated text file.
    embeddings={}
    for line in codecs.open(args.embeddings,encoding="utf-8").readlines():
        line = line.strip()
        if line:
            a= line.split(" ")
            embeddings[a[0]] = np.array([float(v) for v in a[1:]]) #cast to float, otherwise we cannot operate

    train_indices = []
    test_indices = []
    train_scores = []
    train_features = []
    test_features = []

    # if args.learner == "logisticregression":
    #     learner= LogisticRegression()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeclassification":
    #     learner = tree.DecisionTreeClassifier()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeregression":
    #     learner = tree.DecisionTreeRegressor()
    #     learner_type = "regression"
    # elif args.learner == "bayesianridge":
    #     learner = BayesianRidge()
    #     learner_type = "regression"
    # else:
    # Learner is fixed to regression; the classification path below is kept
    # for the (commented-out) learner-selection code above.
    learner = BayesianRidge()
    learner_type = "regression"

    le = preprocessing.LabelEncoder()

    # Training file format: index <TAB> score <TAB> tweet text.
    for line in open(args.train).readlines():
        (index, score, tweet) = line.strip().split("\t")
        train_indices.append(index)
        train_scores.append(float(score))
        tweet = tweet.split(" ")
        train_features.append(embedfeats(tweet,embeddings,stoplist))

    train_indices = np.array(train_indices)
    train_scores = np.array(train_scores)
    train_features = np.array(train_features)

    # Integer-rounded scores, label-encoded for the classification path.
    train_scores_int = [roundup(v) for v in train_scores]
    le.fit(train_scores_int)
    train_scores_int_transformed = le.transform(train_scores_int)

    if args.cv:
        train_cv={}
        cross=cross_validation.KFold(len(train_scores),n_folds=10)
        acc=[]
        for train_index, test_index in cross:
            #if args.debug:
            #    print("TRAIN:", len(train_index), "TEST:", len(test_index))
            X=train_features
            y=train_scores
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            learner.fit(X_train,y_train)
            y_pred= learner.predict(X_test)
            assert(len(y_pred)==len(test_index))
            # Remember each held-out tweet's prediction by its tweet id.
            tids=train_indices[test_index]
            for twid,pred in zip(tids,y_pred):
                train_cv[twid] = pred
            acc.append(cosine_similarity(y_test,y_pred)[0][0])
        print >>sys.stderr, "Cosine of 10-folds:", acc
        print >>sys.stderr, "Macro average:", np.mean(np.array(acc)), np.std(np.array(acc))
        # Emit out-of-fold predictions in original training order.
        for twid in train_indices:
            print "{}\t{}".format(twid,train_cv[twid])
    else:
        for line in open(args.test).readlines():
            (index, score, tweet) = line.strip().split("\t")
            test_indices.append(index)
            #scores.append(score)
            tweet = tweet.split(" ")
            test_features.append(embedfeats(tweet,embeddings,stoplist))
        #print np.array(train_features).shape
        # when features are generated, train and test
        if learner_type == "regression":
            learner.fit(train_features,train_scores)
        else:
            learner.fit(train_features,train_scores_int_transformed)
        predicted_scores= learner.predict(test_features)
        if learner_type != "regression":
            predicted_scores = le.inverse_transform(predicted_scores)
        for index, score in zip(test_indices,predicted_scores):
            print index+"\t"+str(score)
import time import sys import numpy import vector from sklearn.linear_model import BayesianRidge, LinearRegression from sklearn.cross_validation import train_test_split usage = "filename features_file labels_file output_file" if __name__ == "__main__": if (len(sys.argv)!=5): print usage else: file_x = sys.argv[1] file_y = sys.argv[2] file_out = sys.argv[3] split_seed = sys.argv[4] X = numpy.genfromtxt(file_x, delimiter=' ') y = numpy.genfromtxt(file_y, delimiter=' ') # Split the data into training/testing sets X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=split_seed) # Bayesian Ridge Regression clf = BayesianRidge(compute_score=True) clf.fit(X, y) y_predict=clf.predict(X_test) numpy.savetxt(file_out, y_predict)
def nickmain1():
    """Per-target soil-property modeling (Kaggle AfSIS-style columns).

    For each target, select the 200 best spectral features with SelectKBest,
    add the shared non-spectral features, then either run 10-fold CV with
    BayesianRidge (issub == False) or fit on all data and write predictions.
    Python 2 code; uses the long-deprecated DataFrame.ix indexer. Relies on
    module-level trainloc/testloc/predloc/issub/ev — defined elsewhere.
    """
    train_all = pd.read_csv(trainloc)
    target_all = pd.read_csv(trainloc)
    test_all = pd.read_csv(testloc)
    targets = ['Ca','P','pH','SOC','Sand']
    train_cols_to_remove = ['PIDN']+targets
    # Encode the categorical depth as a signed numeric feature.
    train_all["Depth"] = train_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
    test_all["Depth"] = test_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
    common_features = ['BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI']
    feats_list = {}
    colnames_nums = []
    # Spectral columns are named m<wavenumber>, e.g. m7497.96 ... m599.76.
    colnames = train_all.ix[:,'m7497.96':'m599.76'].columns.values
    for x in colnames:
        match = re.search(r'(?<=m)[0-9]*',x)
        if match:
            colnames_nums.append(int(match.group()))
    print len(colnames)
    print len(colnames_nums)
    print len(train_all.ix[0,'m7497.96':'m599.76'].values)
    # Per-target univariate feature selection over the spectral band.
    for target in targets:
        selector = SelectKBest(f_regression, k=200)
        selector.fit_transform(train_all.ix[:,'m7497.96':'m599.76'], train_all[target])
        selected = selector.get_support()
        feats = [col for (col,sel) in zip(list(train_all.ix[:,'m7497.96':'m599.76'].columns.values), selected) if sel]
        feats_list[target] = feats+common_features

    #pickTest = ['PIDN', 'BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI','Depth']#ORIGINAL10
    ids = np.genfromtxt(testloc, dtype=str, skip_header=1, delimiter=',', usecols=0)
    # Placeholder submission frame; target columns are overwritten below.
    df = pd.DataFrame({"PIDN": ids, "Ca": test_all['PIDN'], "P": test_all['PIDN'], "pH": test_all['PIDN'], "SOC": test_all['PIDN'], "Sand": test_all['PIDN']})
    cv = cross_validation.KFold(len(train_all), n_folds=10, indices=False)
    subresults = {}
    results = []
    if issub == False:
        # Validation mode: average per-target RMSE over the 10 folds.
        for train_sub, test_sub in cv:
            for target in targets:
                #clf = ensemble.GradientBoostingRegressor(n_estimators=6)
                #clf = RandomForestRegressor(n_estimators = 40)
                #clf = linear_model.Lasso(alpha=0.08)
                #clf = svm.SVC()
                #clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
                #clf = Ridge(alpha=1.0)
                #clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
                clf = BayesianRidge(compute_score=True)
                clf.fit(np.array(train_all[feats_list[target]])[train_sub], np.array(train_all[target])[train_sub])
                pred = clf.predict(np.array(train_all[feats_list[target]])[test_sub])
                subresults[target] = ev.rmse(np.array(train_all[target])[test_sub],np.array(pred))
                #df[target] = pred
            subtotal = 0
            for x in subresults:
                subtotal = subtotal + subresults[x]
            print ("average for the run is ", subtotal/len(targets))
            results.append(subtotal/len(targets))
        print "Results: " + str( np.array(results).mean() )
    else:
        # Submission mode: fit on all training rows, predict the test set.
        for target in targets:
            #clf = ensemble.GradientBoostingRegressor(n_estimators=6)
            #clf = RandomForestRegressor(n_estimators = 20)
            #clf = linear_model.Lasso(alpha=0.08)
            #clf = svm.SVC()
            #clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
            #clf = Ridge(alpha=1.0)
            #clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
            clf = BayesianRidge(compute_score=True)
            clf.fit(np.array(train_all[feats_list[target]]), np.array(train_all[target]))
            pred = clf.predict(np.array(test_all[feats_list[target]]))
            df[target] = pred
        df.to_csv(predloc, index=False, cols=["PIDN","Ca","P","pH","SOC","Sand"])
# Linear Regression print 'linear' lr = LinearRegression() #lr.fit(x[:, np.newaxis], y) #lr_sts_scores = lr.predict(xt[:, np.newaxis]) lr.fit(x, y) lr_sts_scores = lr.predict(xt) # Baysian Ridge Regression print 'baysian ridge' br = BayesianRidge(compute_score=True) #br.fit(x[:, np.newaxis], y) #br_sts_scores = br.predict(xt[:, np.newaxis]) br.fit(x, y) br_sts_scores = br.predict(xt) # Elastic Net print 'elastic net' enr = ElasticNet() #enr.fit(x[:, np.newaxis], y) #enr_sts_scores = enr.predict(xt[:, np.newaxis]) enr.fit(x, y) enr_sts_scores = enr.predict(xt) # Passive Aggressive Regression print 'passive aggressive' par = PassiveAggressiveRegressor() par.fit(x, y)
def main():
    """Place every ordered pair of filter consensus motifs into a neutral
    sequence, score the sequences with a Basset model (external torch call),
    fit an additive BayesianRidge expectation over the pair scores, and plot
    the residual (observed - expected) as a filter-interaction heat map.

    NOTE(review): Python 2 code (print statement / `print >>` file syntax).
    """
    usage = "usage: %prog [options] <model_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_dist",
        default=10,
        type="int",
        help="Distance between the motifs and sequence center [Default: %default]",
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]"
    )
    parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file")
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(",")]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # fixed seed for reproducible sequence generation
    random.seed(1)

    # torch options
    cuda_str = ""
    if options.cuda:
        cuda_str = "-cuda"

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs (project-local helper)
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    # num_filters = len(filter_consensus)
    # NOTE(review): hard-coded to the first 20 filters, overriding the full
    # filter count on the commented line above -- confirm this is intentional.
    num_filters = 20
    filter_len = filter_consensus[0].shape[1]

    # position the motifs symmetrically around the sequence center
    # NOTE(review): integer division under Python 2; under Python 3 these
    # become floats and would break the slice indexing below.
    left_i = options.seq_length / 2 - options.center_dist - filter_len
    right_i = options.seq_length / 2 + options.center_dist

    # neutral background: uniform 0.25 across the 4 nucleotides ("all N")
    ns_1hot = np.zeros((4, options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    # one sequence per ordered (left, right) motif pair
    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i]
            motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length))

    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = "%s/motif_seqs.h5" % options.out_dir
    h5f = h5py.File(seqs_file, "w")
    h5f.create_dataset("test_in", data=seqs_1hot)
    h5f.close()

    # predict scores via the external torch script
    scores_file = "%s/motif_seqs_scores.h5" % options.out_dir
    torch_cmd = "th basset_place2_predict.lua %s %s %s %s" % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, "r")
    motif_seq_scores = np.array(hdf5_in["scores"])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        # design matrix: left-motif identity one-hot in the first num_filters
        # columns, right-motif identity in the second num_filters columns.
        # Row order matches the sequence-generation loop above.
        X = np.zeros((motif_seq_scores.shape[0], 2 * num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi, i] += 1
                X[xi, num_filters + j] += 1
                xi += 1

        # fit the additive (no-interaction) model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:, ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:, ti])

        # print filter coefficients
        coef_out = open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w")
        for i in range(num_filters):
            print >> coef_out, "%3d %6.2f" % (i, model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        # interaction = observed pair score minus additive expectation
        filter_interaction = np.zeros((num_filters, num_filters))
        table_out = open("%s/table_t%d.txt" % (options.out_dir, ti), "w")

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j])
                print >> table_out, "%3d %3d %6.3f %6.3f %6.3f" % cols
                si += 1
        table_out.close()

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction)
        plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
### Imputing DYAR train = df[(df.DYAR.isnull() ==False) & (df.pct_team_tgts.isnull() == False)] train.reset_index(inplace=True, drop=True) test = df[(df.DYAR.isnull() == True) & (df.pct_team_tgts.isnull() == False)] test.reset_index(inplace= True, drop=True) features = ['targets', 'receptions', 'rec_tds', 'start_ratio', 'pct_team_tgts', 'pct_team_receptions', 'pct_team_touchdowns', 'rec_yards', 'dpi_yards', 'fumbles', 'first_down_ctchs', 'pct_of_team_passyards'] X = scale(train[features]) y = train.DYAR # Our best model for predicting DYAR was a Bayesian Ridge Regressor br = BayesianRidge() br.fit(X,y) dyar_predictions = pd.DataFrame(br.predict(scale(test[features])), columns = ['DYAR_predicts']) test = test.join(dyar_predictions) test['DYAR'] = test['DYAR_predicts'] test.drop('DYAR_predicts', inplace=True, axis=1) frames = [train,test] df = pd.concat(frames, axis=0, ignore_index=True) ### Imputing EYds train = df[(df.EYds.isnull() ==False) & (df.pct_team_tgts.isnull() == False)] train.reset_index(inplace=True, drop=True) test = df[(df.EYds.isnull() == True) & (df.pct_team_tgts.isnull() == False)] test.reset_index(inplace= True, drop=True) # A Bayesian Ridge was also our best predictor for EYds. In general, we're able to most confidently predict EYds.
def main():
    """Place every ordered pair of filter consensus motifs into a neutral
    sequence, score the sequences with a Basset model (external torch call),
    fit an additive BayesianRidge expectation over the pair scores, and plot
    the quantile-clipped residual (observed - expected) as a heat map.

    NOTE(review): Python 2 code (print statement / `print >>` file syntax).
    This is a second variant of main() -- the file appears to concatenate
    several scripts; only one definition survives at import time.
    """
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    # fixed seed for reproducible sequence generation
    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs (project-local helper)
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs symmetrically around the sequence center
    # NOTE(review): integer division under Python 2; under Python 3 these
    # become floats and would break the slice indexing below.
    left_i = options.seq_length/2 - options.center_dist - filter_len
    right_i = options.seq_length/2 + options.center_dist

    # neutral background: uniform 0.25 across the 4 nucleotides ("all N")
    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    # one sequence per ordered (left, right) motif pair
    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))

    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores via the external torch script
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        # design matrix: left-motif identity one-hot in the first num_filters
        # columns, right-motif identity in the second num_filters columns.
        # Row order matches the sequence-generation loop above.
        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit the additive (no-interaction) model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:,ti])

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print >> coef_out, '%3d %6.2f' % (i,model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        # interaction = observed pair score minus additive expectation
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print >> table_out, '%3d %3d %6.3f %6.3f %6.3f' % cols
                si += 1
        table_out.close()

        # clip interactions to the 99.9th percentile of |interaction| so a few
        # extreme pairs don't saturate the heat map color scale
        # NOTE(review): stats.quantile is not scipy.stats -- presumably a
        # project-local stats module; confirm the import at file top.
        scores_abs = abs(filter_interaction.flatten())
        max_score = stats.quantile(scores_abs, .999)
        print 'Limiting scores to +-%f' % max_score
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
# Benchmark several regressors on the cross-validation split, timing each fit.
# Relies on names defined earlier in the file: Model_two (a fitted KNN model,
# presumably), X_crosstrain_scl/X_crosstest_scl (scaled features),
# X_crosstrain_svd/X_crosstest_svd (SVD-reduced features), y_crosstrain,
# y_crosstest, and a running `start` timestamp.
KNNmse = Model_two.predict(X_crosstest_scl)
print("KNN_RMSE:",np.sqrt(mean_squared_error(y_crosstest,KNNmse)))
print(datetime.now() - start)

# Random forest on the SVD-reduced features
start = datetime.now()
from sklearn import ensemble
Model_three = ensemble.RandomForestRegressor(n_estimators = 500,verbose=1,n_jobs=-1,random_state = 120,max_depth=16)
Model_three.fit(X_crosstrain_svd,y_crosstrain)
RFmse = Model_three.predict(X_crosstest_svd)
print("RandomForest_RMSE:",np.sqrt(mean_squared_error(y_crosstest,RFmse)))
print(datetime.now() - start)

# Bayesian ridge on the scaled features
# NOTE(review): the `normalize` parameter was removed from sklearn linear
# models in 1.2 -- this code requires an older sklearn.
start = datetime.now()
from sklearn.linear_model import BayesianRidge
BR = BayesianRidge(n_iter=500,tol= 0.001,normalize=True).fit(X_crosstrain_scl,y_crosstrain)
pred_BR = BR.predict(X_crosstest_scl)
print("BayesinRidge_RMSE:",np.sqrt(mean_squared_error(y_crosstest,pred_BR)))
print(datetime.now() - start)

# Ordinary least squares on the SVD-reduced features
start = datetime.now()
from sklearn.linear_model import LinearRegression
LR = LinearRegression(fit_intercept = True,normalize = True,n_jobs=-1).fit(X_crosstrain_svd,y_crosstrain)
pred_LR = LR.predict(X_crosstest_svd)
print("LinearRegression_RMSE:",np.sqrt(mean_squared_error(y_crosstest,pred_LR)))
print(datetime.now() - start)

#decision tree along with Adaboost
start = datetime.now()
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor