def ctr_gbdt(model='sklearn-clicklog', from_cache=False, train_dataset_length=100000, test_dataset_length=100000):
    """Fit a gradient-boosting classifier and print its train/test log loss.

    All four arguments are forwarded unchanged to ``create_dataset``; the two
    values it returns are each passed to ``clean_data`` to obtain a
    ``(features, labels)`` pair.
    """
    train_file, test_file = create_dataset(
        model, from_cache, train_dataset_length, test_dataset_length)

    gbdt_settings = dict(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )
    prediction_model = GradientBoostingClassifier(**gbdt_settings)

    x_train, y_train = clean_data(train_file)
    x_test, y_test = clean_data(test_file)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        proba_train = prediction_model.predict_proba(x_train)
        proba_test = prediction_model.predict_proba(x_test)
        loss_train = log_loss(y_train, proba_train)
        loss_test = log_loss(y_test, proba_test)

    for label, value in (('loss_train', loss_train), ('loss_test', loss_test)):
        print('%s: %s' % (label, value))
def ensembleGBM(derived_data_path, X_train, Y_train, X_test, seed=60):
    """Fit seven gradient-boosting models and return their averaged score.

    The fitted models are stored through ``saveObject`` under the name
    ``GBM_classifiers.obj``. The return value is the mean of column 1 of
    each model's ``predict_proba(X_test)``.
    """
    random.seed(seed)

    # One row per ensemble member:
    # (n_estimators, learning_rate, min_samples_leaf, max_depth)
    member_settings = [
        (1500, 0.008, 5, 7),
        (1700, 0.007, 5, 7),
        (1600, 0.0075, 5, 7),
        (1650, 0.007, 5, 8),
        (1750, 0.00725, 6, 7),
        (1550, 0.00775, 4, 7),
        (1850, 0.00725, 5, 6),
    ]
    members = [
        GradientBoostingClassifier(n_estimators=trees,
                                   learning_rate=rate,
                                   min_samples_leaf=leaf,
                                   max_features=0.2,
                                   max_depth=depth)
        for trees, rate, leaf, depth in member_settings
    ]

    for position, member in enumerate(members, 1):
        print("Running Model %d" % position)
        member.fit(X_train, Y_train)

    saveObject(derived_data_path, 'GBM_classifiers.obj', members)

    # Accumulate left to right so the floating-point sum is formed in the
    # same order as a single chained ``+`` expression over the members.
    summed = members[0].predict_proba(X_test)[:, 1]
    for member in members[1:]:
        summed = summed + member.predict_proba(X_test)[:, 1]
    return float(1) / 7 * summed
def predict(fea, df, t, t9):
    """Train a default GBM on the rows ``df[t]`` and print AUC diagnostics.

    Args:
        fea: iterable of base feature names. For each name, the columns
            ``name``, ``name + '_x'`` and ``name + '_y'`` are selected when
            they exist in ``df``.
        df: DataFrame containing the feature columns and a ``label`` column.
        t: row selector for the modelling data (used as ``df[t]``).
        t9: row selector for the hold-out data reported as "September".

    Returns:
        tuple: ``(Un, clf)`` - the boolean column mask that was used and the
        fitted ``GradientBoostingClassifier``.
    """
    # Start from a mask that only matches a column literally named 'Blank'.
    Un = df.columns == 'Blank'
    # The original looped over an undefined name ``Fea``; the parameter is
    # ``fea``.
    for f in fea:
        Un = Un | (df.columns == f)
        Un = Un | (df.columns == (f + '_x'))
        Un = Un | (df.columns == (f + '_y'))
    # 'New_y' is always excluded, even when 'New' is among the features.
    Un = Un & (df.columns != 'New_y')

    clf = GradientBoostingClassifier()
    train_rows = df[t]
    y = train_rows.label
    # ``.loc`` with a boolean column mask replaces the removed ``.ix``.
    X = train_rows.loc[:, Un]
    # NOTE(review): test_size=0.9 trains on only 10% of the rows - confirm
    # this is intentional.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=1)
    clf.fit(X_train, y_train)

    test_auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])
    print('Testing AUC: \t' + str(test_auc))

    holdout = df[t9]
    holdout_auc = roc_auc_score(
        holdout.label, clf.predict_proba(holdout.loc[:, Un])[:, 1])
    print('September AUC: \t' + str(holdout_auc))

    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
def ensembleGBMTest(derived_data_path, X_train, Y_train, X_test, Y_test):
    """Fit the seven-model GBM ensemble and print per-model and combined gini.

    Each model is scored with ``gini``; the un-normalised sum of the
    column-1 probabilities is scored with ``giniNoEstimator``. The fitted
    models are then stored through ``saveObject``.
    """
    random.seed(60)

    # (n_estimators, learning_rate, min_samples_leaf, max_depth)
    member_settings = (
        (1500, 0.008, 5, 7),
        (1700, 0.007, 5, 7),
        (1600, 0.0075, 5, 7),
        (1650, 0.007, 5, 8),
        (1750, 0.00725, 6, 7),
        (1550, 0.00775, 4, 7),
        (1850, 0.00725, 5, 6),
    )
    GBMClassifiers = []
    for trees, rate, leaf, depth in member_settings:
        GBMClassifiers.append(
            GradientBoostingClassifier(n_estimators=trees,
                                       learning_rate=rate,
                                       min_samples_leaf=leaf,
                                       max_features=0.2,
                                       max_depth=depth))

    # Every model is fitted before any of them is scored.
    for member in GBMClassifiers:
        member.fit(X_train, Y_train)

    for position, member in enumerate(GBMClassifiers, 1):
        print("GBM%d: %f" % (position, gini(member, X_test, Y_test)))

    # now combine! (left-to-right accumulation keeps the float sum order)
    combine = GBMClassifiers[0].predict_proba(X_test)[:, 1]
    for member in GBMClassifiers[1:]:
        combine = combine + member.predict_proba(X_test)[:, 1]
    print("With our powers combined: %f" % (giniNoEstimator(Y_test, combine)))

    saveObject(derived_data_path, 'GBM_classifiers.obj', GBMClassifiers)
class TestGradientBoostingClassifierConverter(TestCase):
    """PMML conversion checks for a small fitted GradientBoostingClassifier."""

    def setUp(self):
        np.random.seed(1)
        self.est = GradientBoostingClassifier(max_depth=2, n_estimators=10)
        self.est.fit([[0, 0], [0, 1], [1, 0], [1, 1]], [0, 1, 1, 1])

        def input_features():
            # A fresh list (and fresh feature objects) for every schema slot.
            return [
                IntegerNumericFeature("x1"),
                StringCategoricalFeature("x2", ["zero", "one"]),
            ]

        schema = {}
        schema[Schema.INPUT] = input_features()
        schema[Schema.MODEL] = input_features()
        schema[Schema.DERIVED] = []
        schema[Schema.OUTPUT] = [IntegerCategoricalFeature("output", [0, 1])]
        self.ctx = TransformationContext(schema)
        self.converter = GradientBoostingConverter(estimator=self.est, context=self.ctx)

    def _check_mining_model(self, document):
        # Structural assertions shared by both conversion tests.
        mm = document.MiningModel[0]
        assert mm.MiningSchema is not None, "Missing mining schema"
        assert len(mm.MiningSchema.MiningField) == 2, "Wrong number of mining fields"
        assert mm.Segmentation is not None, "Missing segmentation root"

    def test_transform(self):
        self._check_mining_model(self.converter.pmml())

    def test_transform_with_verification(self):
        x2_names = ("zero", "one")
        verification_rows = []
        for x1 in (0, 1):
            for x2 in (0, 1):
                verification_rows.append({
                    "x1": x1,
                    "x2": x2_names[x2],
                    "output": self.est.predict_proba([[x1, x2]])[0, 1],
                })
        self._check_mining_model(self.converter.pmml(verification_rows))
def GB_Classifier(X_train, X_cv, X_test, Y_train,Y_cv,Y_test, Actual_DS):
    """Fit a GBM, print validation diagnostics and return three probability sets.

    Args:
        X_train, Y_train: data the classifier is fitted on.
        X_cv, Y_cv: validation data used for accuracy, the confusion table
            and log loss.
        X_test: test features; only probabilities are produced for them.
        Y_test: accepted for signature compatibility; not used.
        Actual_DS: further feature set to score (passed to ``predict_proba``
            as-is).

    Returns:
        tuple of three DataFrames holding ``predict_proba`` for ``X_cv``,
        ``X_test`` and ``Actual_DS`` respectively.
    """
    print("***************Starting Gradient Boosting***************")
    t0 = time()

    clf = GradientBoostingClassifier(n_estimators=500, learning_rate=0.01)
    clf.fit(X_train, Y_train)

    preds = clf.predict(X_cv)
    score = clf.score(X_cv, Y_cv)
    print("Gradient Boosting - {0:.2f}%".format(100 * score))

    # Confusion table on the decoded labels (``label_enc`` is module-level).
    Summary = pd.crosstab(label_enc.inverse_transform(Y_cv),
                          label_enc.inverse_transform(preds),
                          rownames=['actual'], colnames=['preds'])
    # Divide every row by its own total. This needs axis=0: with axis=1 the
    # row totals would be aligned with the *column* labels, so each cell
    # would be divided by an unrelated row's total.
    row_totals = Summary.sum(axis=1)
    Summary['pct'] = Summary.divide(row_totals, axis=0).max(axis=1) * 100
    print(Summary)

    # Check with log loss function
    epsilon = 1e-15
    preds2 = clf.predict_proba(X_cv)
    ll_output2 = log_loss(Y_cv, preds2, eps=epsilon, normalize=True)
    print(ll_output2)
    print("done in %0.3fs" % (time() - t0))

    preds3 = clf.predict_proba(X_test)
    preds4 = clf.predict_proba(Actual_DS)
    print("***************Ending Gradient Boosting***************")
    return pd.DataFrame(preds2), pd.DataFrame(preds3), pd.DataFrame(preds4)
def predict(fea1,fea2, df, t, t9):
    """Train one GBM per feature list and print individual and blended AUCs.

    For each of ``fea1`` and ``fea2`` a default GBM is fitted on min-max
    scaled features from ``df[t]`` and scored on ``df[t9]``. The two
    hold-out scores are blended with fixed weights 0.73 / 0.27.

    Args:
        fea1, fea2: iterables of base feature names; ``name``, ``name_x``
            and ``name_y`` columns are selected when present.
        df: DataFrame with the feature columns and a ``label`` column.
        t: row selector for the modelling data.
        t9: row selector for the hold-out data reported as "September".

    Returns:
        tuple: ``(Un, clf)`` - column mask and classifier of the *last*
        feature list processed (``fea2``).
    """
    weight = [0.73, 0.27]
    X_1 = df[t]
    df9 = df[t9]
    y = X_1.label
    tave = np.zeros(len(df9))

    for n, fea in enumerate([fea1, fea2]):
        Un = df.columns == 'Blank'
        for f in fea:
            Un = Un | (df.columns == f)
            Un = Un | (df.columns == (f + '_x'))
            Un = Un | (df.columns == (f + '_y'))
        Un = Un & (df.columns != 'quarterly_attrition_rate_y')

        clf = GradientBoostingClassifier()
        X = X_1.loc[:, Un]
        # NOTE(review): test_size=0.9 trains on only 10% of the rows -
        # confirm this is intentional.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.9, random_state=1)

        min_max_scaler = preprocessing.MinMaxScaler()
        clf.fit(min_max_scaler.fit_transform(X_train), y_train)

        test_proba = clf.predict_proba(min_max_scaler.transform(X_test))[:, 1]
        print('Testing AUC: \t' + str(roc_auc_score(y_test, test_proba)))

        # Reuse the scaler fitted on the training split. Re-fitting it on
        # the hold-out rows (fit_transform) would map them onto a different
        # scale than the one the model was trained on.
        holdout_proba = clf.predict_proba(
            min_max_scaler.transform(df9.loc[:, Un]))[:, 1]
        print('September AUC: \t' + str(roc_auc_score(df9.label, holdout_proba)))

        tave = holdout_proba * weight[n] + tave
        print('-' * 30)

    print(weight)
    print('Total AUC')
    print('September AUC: \t' + str(roc_auc_score(df9.label, tave)))
    return Un, clf
def gbdt_solver(train_data, train_label, validation, test, unlabel, dimreduce=decomposition.undo):
    """Fit a GBDT on the transformed training data and score the test set.

    ``dimreduce`` is called with all five inputs and must return four
    values (train, validation, test, unlabelled). The fitted model is
    dumped to ``ROOT + "/result/gbdt.pkl"``, the validation scores are
    handed to ``evaluate.get_auc``, and column 1 of the test-set
    ``predict_proba`` is returned.
    """
    logging.info("begin to train the gbdt classifier")
    transformed = dimreduce(train_data, train_label, validation, test, unlabel)
    new_train_data, new_val, new_test, _new_unlabel = transformed
    logging.info("finished feature extracting")

    # Fixed hyper-parameters; a grid / randomised search over n_estimators,
    # learning_rate and max_depth (scoring 'roc_auc') used to live here as
    # commented-out code.
    gb = GradientBoostingClassifier(
        n_estimators=600,
        learning_rate=0.03,
        random_state=1000000007,
        max_depth=2,
        warm_start=True,
    )
    gb.fit(new_train_data, train_label)
    joblib.dump(gb, ROOT + "/result/gbdt.pkl")

    validation_scores = gb.predict_proba(new_val)[:, 1]
    evaluate.get_auc(validation_scores)

    test_scores = gb.predict_proba(new_test)[:, 1]
    return test_scores
def main():
    """Train a GBM on the training CSV, report validation metrics, write test probabilities.

    Reads ``train_path`` and ``test_path`` (module-level names), holds out
    the last 5% of the shuffled training rows for validation, prints
    accuracy and a hand-computed log loss, then passes the test-set class
    probabilities to ``write_results_prob``.
    """
    train_f = pd.read_csv(train_path, header=0, parse_dates=['Dates'])
    print(train_f.dtypes)
    X, Y = get_feature(train_f, "training_set")

    ### TRAINING
    clf = GradientBoostingClassifier(n_estimators=50)
    X, Y = shuffle_XY(X, Y)

    # Floor division keeps the split point an int under both Python 2 and
    # Python 3; with true division the slice index would be a float.
    train_len = len(X) * 95 // 100
    X_train, X_val = X[:train_len], X[train_len:]
    Y_train, Y_val = Y[:train_len], Y[train_len:]

    clf = clf.fit(X_train, Y_train)
    print("Training done")

    val_acc = clf.score(X_val, Y_val)
    print("Val acc: %s" % (val_acc,))

    # Manual multiclass log loss: mean negative log of the probability
    # assigned to the true class, with a small constant to avoid log(0).
    # NOTE(review): indexing the probability column by the label value
    # assumes labels are 0..K-1 and match the classifier's column order -
    # confirm.
    val_pred = clf.predict_proba(X_val)
    val_log = 0.0
    for row, y in enumerate(Y_val):
        val_log += math.log(val_pred[row, y] + 0.0000001)
    val_log = -val_log / len(Y_val)
    print("Val log loss: %s" % (val_log,))

    ### Testing
    test_f = pd.read_csv(test_path, header=0, parse_dates=['Dates'])
    X_test, _ = get_feature(test_f, "test_set")
    Y_test = clf.predict_proba(X_test)

    ### Write results
    write_results_prob(Y_test)
class MyGradientBoost(MyClassifier):
    """Thin wrapper around ``GradientBoostingClassifier`` with importance helpers."""

    def __init__(self, params=None):
        """Create the wrapped classifier.

        Args:
            params: optional dict of keyword arguments for
                ``GradientBoostingClassifier``. It is copied, so later
                ``update_params`` calls never modify the caller's dict or a
                default shared between instances.
        """
        self._params = dict(params) if params is not None else {}
        self._gb = GradientBoostingClassifier(**(self._params))

    def update_params(self, updates):
        """Merge ``updates`` into the stored parameters and rebuild the (unfitted) model."""
        self._params.update(updates)
        self._gb = GradientBoostingClassifier(**(self._params))

    def fit(self, Xtrain, ytrain):
        """Fit the wrapped classifier."""
        self._gb.fit(Xtrain, ytrain)

    def predict_proba(self, Xtest, option = None):
        """Return column 1 of the wrapped ``predict_proba``; ``option`` is ignored."""
        return self._gb.predict_proba(Xtest)[:, 1]

    def predict_proba_multi(self, Xtest, option = None):
        """Return the full ``predict_proba`` matrix; ``option`` is ignored."""
        return self._gb.predict_proba(Xtest)

    def plt_feature_importance(self, fname_list, f_range = None):
        """Show a horizontal bar chart of feature importances.

        Args:
            fname_list: feature names, indexed by feature position.
            f_range: optional sequence of rank positions (0 = most
                important) to plot; all features when empty or ``None``.
        """
        importances = self._gb.feature_importances_
        # Spread of the importance across boosting stages, using the first
        # tree of every stage.
        std = np.std([tree[0].feature_importances_ for tree in self._gb.estimators_], axis=0)
        indices = np.argsort(importances)[::-1]
        fname_array = np.array(fname_list)
        if not f_range:
            f_range = list(range(indices.shape[0]))
        n_f = len(f_range)
        selected = indices[f_range]

        plt.figure()
        plt.title("Gradient Boost Feature importances")
        plt.barh(range(n_f), importances[selected], color="b",
                 xerr=std[selected], ecolor='k', align="center")
        plt.yticks(range(n_f), fname_array[selected])
        plt.ylim([-1, n_f])
        plt.show()

    def list_feature_importance(self, fname_list, f_range = None, return_list = False):
        """Print the feature ranking.

        Args:
            fname_list: feature names, indexed by feature position.
            f_range: optional sequence of rank positions to report; all
                features when empty or ``None``.
            return_list: when true, return the feature indices of the
                reported ranks in order.

        Returns:
            list of feature indices when ``return_list`` is true, else ``None``.
        """
        importances = self._gb.feature_importances_
        indices = np.argsort(importances)[::-1]
        print('Gradient Boost feature ranking:')
        if not f_range:
            f_range = list(range(indices.shape[0]))
        for f in f_range:
            print('{0:d}. feature[{1:d}] {2:s} ({3:f})'.format(
                f + 1, indices[f], fname_list[indices[f]], importances[indices[f]]))
        if return_list:
            return [indices[f] for f in f_range]
def do_gbdt4(train_x, train_y, test_x=None, test_y=None, learning_rate=0.03, max_depth=8, max_features=25, n_estimators=600, load=False, save=True, outfile=None, search=False, log=False):
    """Train (or load a cached) GBDT, or run a small hyper-parameter grid.

    Args:
        train_x, train_y: training features and labels.
        test_x, test_y: optional evaluation data; required when ``search``
            is true.
        learning_rate, max_depth, max_features, n_estimators: model
            hyper-parameters for the single-model path.
        load: accepted for compatibility; not used.
        save: pickle a newly trained model (single-model path).
        outfile: optional writable object that receives the failure
            message when pickling fails.
        search: when true, evaluate a fixed grid instead of one model.
        log: selects the ``gbdt_log_train_`` cache-file prefix.

    Returns:
        The fitted classifier on the single-model path; ``None`` after a
        grid search (results are printed).

    Raises:
        ValueError: if ``search`` is true and no test data was supplied.
    """
    if not search:
        prefix = 'gbdt_log_train_lr' if log else 'gbdt_train_lr'
        # NOTE(review): the cache key ignores max_features, so a model
        # trained with a different max_features is reused - confirm.
        mdl_name = prefix + str(learning_rate) + '_n' + str(n_estimators) + '_maxdep' + str(max_depth) + '.pkl'

        if os.path.exists(mdl_name):
            clf_gbdt = joblib.load(mdl_name)
        else:
            clf_gbdt = GradientBoostingClassifier(learning_rate=learning_rate, max_depth=max_depth,
                                                  max_features=max_features, n_estimators=n_estimators)
            clf_gbdt.fit(train_x, train_y)
            if save:
                # Saving is best effort: report the failure, keep the model.
                try:
                    joblib.dump(clf_gbdt, mdl_name, compress=1)
                except Exception:
                    print("*** Save GBM model to pickle failed!!!")
                    if outfile is not None:
                        outfile.write("*** Save GBM model to pickle failed!!!")

        # ``is not None`` instead of ``!= None``: comparing an array with
        # ``!=`` is element-wise and has no single truth value.
        if test_x is not None and test_y is not None:
            probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
            score_gbdt = roc_auc_score(test_y, probas_gbdt)
            print("GBDT ROC score %s" % (score_gbdt,))
        return clf_gbdt

    if test_x is None or test_y is None:
        raise ValueError("search=True requires test_x and test_y")

    max_depth_list = [6, 7, 8, 9, 10]
    n_list = [2000]
    lr_list = [0.005, 0.003]
    max_feat_list = [15, 16, 17, 18, 20]
    info = {}
    for md in max_depth_list:
        for n in n_list:
            for lr in lr_list:
                for mf in max_feat_list:
                    print('max_depth =  %s' % md)
                    print('n =  %s' % n)
                    print('learning rate =  %s' % lr)
                    print('max feature =  %s' % mf)
                    mdl_name = 'gbdt_n' + str(n) + '_lr' + str(lr) + '_md' + str(md) + 'mf' + str(mf) + '.pkl'
                    if os.path.exists(mdl_name):
                        clf_gbdt = joblib.load(mdl_name)
                    else:
                        # Use the grid values ``lr`` and ``n``. The function
                        # defaults were passed here before, so every grid
                        # cell trained the same learning rate and tree
                        # count while being cached under different names.
                        clf_gbdt = GradientBoostingClassifier(learning_rate=lr, max_depth=md,
                                                              max_features=mf, n_estimators=n)
                        clf_gbdt.fit(train_x, train_y)
                        joblib.dump(clf_gbdt, mdl_name, compress=1)
                    probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
                    info[md, n, lr, mf] = roc_auc_score(test_y, probas_gbdt)

    for key, score in info.items():
        # One score per configuration, so the spread is reported as 0.
        print('GBDT max_depth = %d, n = %d, lr = %.5f, max_feature = %d, ROC score = %.5f(%.5f)' % (
            key[0], key[1], key[2], key[3], score, 0.0))
def gb(train_data,train_label,val_data,val_label,test_data,name="GradientBoosting_submission.csv"):
    """Fit a default GBM, print the validation log loss and save test probabilities.

    Validation probabilities are scored with ``preprocess.evaluation``; the
    test-set probabilities are written through ``preprocess.saveResult``
    under the file name ``name``.
    """
    print("start training GradientBoosting...")
    model = GradientBoostingClassifier()  # all defaults
    model.fit(train_data, train_label)

    # Evaluate on the validation set.
    val_proba = model.predict_proba(val_data)
    logloss = preprocess.evaluation(val_label, val_proba)
    print("logloss of validation set: %s" % (logloss,))

    print("Start classify test set...")
    test_proba = model.predict_proba(test_data)
    preprocess.saveResult(test_proba, filename=name)
def gb_predictedValue():
    """Fit a GBM on ``train_df`` and return column-1 probabilities for ``test_df``.

    Relies on the module-level names ``train_df``, ``test_df``,
    ``features`` and ``NoOfEstimators``; the target column is
    ``SeriousDlqin2yrs``.
    """
    print('----------GradientBoosting----------')
    model = GradientBoostingClassifier(n_estimators=NoOfEstimators)
    train_inputs = train_df[features]
    train_target = train_df['SeriousDlqin2yrs']
    model.fit(train_inputs, train_target)

    class_proba = model.predict_proba(test_df[features])
    print('Feature Importance = %s' % model.feature_importances_)
    return class_proba[:, 1]
def machineLearning(X, Y_parameters, predict_value, writer):
    """Fit logistic regression and a GBM, then write their averaged probability per row.

    Args:
        X: training features.
        Y_parameters: training labels.
        predict_value: iterable of rows; element 0 is an identifier and the
            remaining elements are the feature values.
        writer: object with a ``writerow`` method (e.g. ``csv.writer``);
            receives ``[identifier, mean_probability]`` for every row.
    """
    logistic = LogisticRegression()
    boosted = GradientBoostingClassifier()
    logistic.fit(X, Y_parameters)
    boosted.fit(X, Y_parameters)
    print("finish fitting")

    for line in predict_value:
        # predict_proba expects a 2-D (n_samples, n_features) input, so the
        # single sample is wrapped in a list.
        sample = [line[1:]]
        value4 = logistic.predict_proba(sample)[0][1]
        value6 = boosted.predict_proba(sample)[0][1]
        writer.writerow([line[0], (value4 + value6) / 2])
    print("finish learning")
def plot_PrecisionRecall (X,y):
    """Fit a GBM on an 80/20 split and plot its precision-recall curve.

    The area under the curve is printed and also shown in the plot title.
    """
    # Unpacking the shape requires X to be two-dimensional.
    _n_samples, _n_features = X.shape

    # Split into training and test
    split = train_test_split(X, y, test_size=.2, random_state=1)
    X_train, X_test, y_train, y_test = split

    clf = GradientBoostingClassifier(n_estimators=400, learning_rate=0.4, max_depth=6)
    clf.fit(X_train, y_train)
    positive_scores = clf.predict_proba(X_test)[:, 1]

    # Compute Precision-Recall and plot curve
    precision, recall, _thresholds = precision_recall_curve(y_test, positive_scores)
    area = auc(recall, precision)
    print("Area Under Curve: %0.2f" % area)

    pl.clf()
    pl.plot(recall, precision, label='Precision-Recall curve')
    pl.xlabel('Recall')
    pl.ylabel('Precision')
    pl.ylim([0.0, 1.05])
    pl.xlim([0.0, 1.0])
    pl.title('Precision-Recall: AUC=%0.2f' % area)
    pl.legend(loc="lower left")
    pl.show()
def train_gbt(filename, color, name):
    '''Train a Gradient Boosted Trees classifier and add its ROC curve to the current plot.

    Args:
        filename: CSV file; the first column is skipped, the last column is
            the target and everything in between is used as features.
        color: matplotlib colour for the ROC line.
        name: legend label for the ROC line.

    Returns:
        tuple: ``(importances, indices)`` - the feature importances and the
        feature positions sorted by decreasing importance.

    Side effects:
        Writes precision, recall, F1 and elapsed time into row 1 of the
        module-level ``metrics`` array.
    '''
    # Read data
    data2 = pd.read_csv(filename, encoding="utf")
    # Positional slicing; ``.iloc`` replaces the removed ``.ix`` indexer.
    X = data2.iloc[:, 1:-1]
    y = data2.iloc[:, -1]

    # Split into train and validation
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define model
    clf1 = GradientBoostingClassifier(learning_rate=0.05, max_depth=5, random_state=42)

    # Fit model (the timer also covers prediction and scoring below)
    t0 = time()
    clf1.fit(X_train, y_train)
    pred_probas = clf1.predict_proba(X_val)
    predictions = clf1.predict(X_val)
    print("Score %s" % (clf1.score(X_val, y_val),))

    importances = clf1.feature_importances_
    indices = np.argsort(importances)[::-1]

    # Metrics & Plotting
    metrics[1, 0] = precision_score(y_val, predictions)
    metrics[1, 1] = recall_score(y_val, predictions)
    metrics[1, 2] = f1_score(y_val, predictions)
    metrics[1, 3] = time() - t0

    # The ROC curve needs continuous scores. Hard 0/1 predictions collapse
    # it to a single operating point, so use the positive-class probability.
    fpr_rf, tpr_rf, _ = roc_curve(y_val, pred_probas[:, 1])
    plt.plot(fpr_rf, tpr_rf, color=color, label=name)
    return importances, indices
def test_staged_predict_proba():
    # The last stage yielded by the staged predictors must agree with the
    # one-shot ``predict`` / ``predict_proba`` results.
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    n_train = 200
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    clf = GradientBoostingClassifier(n_estimators=20)

    # An unfitted estimator must raise NotFittedError once the staged
    # generator is consumed.
    def consume_staged_proba(data):
        return np.fromiter(clf.staged_predict_proba(data), dtype=np.float64)

    assert_raises(NotFittedError, consume_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # Every stage has the label shape; the final stage equals ``predict``.
    for y_pred in clf.staged_predict(X_test):
        assert_equal(y_test.shape, y_pred.shape)
    final_labels = y_pred
    assert_array_equal(clf.predict(X_test), final_labels)

    # Every stage is (n_samples, 2); the final stage equals ``predict_proba``.
    for staged_proba in clf.staged_predict_proba(X_test):
        assert_equal(y_test.shape[0], staged_proba.shape[0])
        assert_equal(2, staged_proba.shape[1])
    final_proba = staged_proba
    assert_array_almost_equal(clf.predict_proba(X_test), final_proba)
def ada_boost():
    """Train a GBM on the pickled training data and write class probabilities.

    Despite the name, the model is a ``GradientBoostingClassifier``.
    Training data is read from ``traindata.pkl`` and test data from
    ``testdata.pkl``; predictions are written through ``kcsv.print_csv``.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    # ``with`` closes the files even when loading fails.
    with open('traindata.pkl', 'rb') as savefile:
        (x_train, y_train, _) = cPickle.load(savefile)
    with open('testdata.pkl', 'rb') as savefile:
        (x_test, _, name1) = cPickle.load(savefile)

    x_train = np.asarray(x_train, dtype=np.float32)
    # Stored labels are shifted down by one before training.
    y_train = np.asarray(y_train, dtype='int32') - 1

    nest = 190
    lr = .1
    md = 6
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr, max_depth=md, random_state=0)

    # Manual switches replacing the former ``if 1:`` / ``if 0:`` blocks.
    make_submission = True
    run_cross_validation = False

    if make_submission:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_%d' % k for k in range(1, 10)]
        kcsv.print_csv(ypred, name1, y_str, indexname='id')
        print((nest, lr, md))

    if run_cross_validation:
        multiclass_log_loss = make_scorer(score_func=logloss_mc, greater_is_better=True, needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5, scoring=multiclass_log_loss)
        print(scores)
        print((nest, lr, md, scores.mean()))
def train():
    """Fit a random forest and a GBM, sum their probabilities and map ids to positions.

    Data comes from ``merge_feature(feature_str)``. For every test id the
    class column with the highest summed probability is translated with
    ``dl.get_num_position``.

    Returns:
        dict mapping each test id to the value returned by
        ``dl.get_num_position`` for its best-scoring class column.
    """
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(feature_str)

    # The body previously mixed snake_case and camelCase spellings of the
    # same variables (trainTarList, trainFeature, allProb, prob, allPreds),
    # none of which were defined; they are unified here.
    target_list = np.array(train_tar_list)
    keep = np.array([m < 32 for m in train_tar_list])
    # NOTE(review): only the targets are filtered; the matching filter on
    # train_feature was commented out in the original. If any target is
    # >= 32 the two lengths differ and fit() will raise - confirm whether
    # merge_feature already drops those rows.
    target_list = target_list[keep]

    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    all_prob = rf_prob + gbdt_prob
    print(all_prob.shape)

    # First column holding the row maximum, like ``list.index(max(row))``.
    all_preds = np.argmax(all_prob, axis=1)
    for test_id, best_column in zip(test_id_list, all_preds):
        posi_result[test_id] = dl.get_num_position(int(best_column))
    return posi_result
def gradientboost_prediction(features_train, labels_train, features_test, ids):
    """Fit a GBM initialised from a random forest and write predictions to CSV.

    Args:
        features_train, labels_train: training data.
        features_test: rows to score.
        ids: identifiers written next to the scores, in the same order as
            ``features_test``.

    Side effects:
        Writes ``data/rf_prediction.csv`` with an ``ID,TARGET`` header.
    """
    class RandomForestClassifier_compability(RandomForestClassifier):
        # ``predict`` is overridden to return the positive-class probability
        # as a column vector, because the forest is used as ``init``.
        def predict(self, X):
            return self.predict_proba(X)[:, 1][:, np.newaxis]

    base_estimator = RandomForestClassifier_compability()
    # ``learn_rate=None`` was dropped: it is not a constructor parameter of
    # GradientBoostingClassifier (``learning_rate`` is already passed).
    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=5, subsample=0.3,
                                     min_samples_split=2, min_samples_leaf=1, max_depth=3,
                                     init=base_estimator, random_state=None, max_features=None,
                                     verbose=2)
    clf = clf.fit(features_train, labels_train)
    pred = clf.predict_proba(features_test)[:, 1]

    # ``with`` closes the file even if writing fails.
    # NOTE(review): binary mode suits csv.writer on Python 2 only; Python 3
    # would need mode "w" with newline="" - confirm the target interpreter.
    with open("data/rf_prediction.csv", "wb") as predictions_file:
        predictions_file_object = csv.writer(predictions_file)
        predictions_file_object.writerow(["ID", "TARGET"])
        predictions_file_object.writerows(zip(ids, pred))
def main():
    """Cross-validate and/or fit a GBM on the access data and write predictions.

    Features are columns 1-8 of ``../data/train.csv`` stacked with every
    column of ``../data/cprobTrain15NA.csv``; the target is ``ACTION``.
    The three local flags select which steps run.
    """
    makeSub = True
    featureImportance = False
    cvfold = True

    df = pd.read_csv('../data/cprobTrain15NA.csv')
    # Read the training file once; positions 1..8 are the base features.
    train_df = pd.read_csv('../data/train.csv')
    base_features = train_df.iloc[:, 1:9]
    y = np.array(train_df.ACTION)
    X = np.hstack((np.array(base_features), np.array(df)))
    # Names in the same order as the stacked columns of X.
    feature_names = list(base_features.columns) + list(df.columns)

    params = {'max_depth': 4, 'subsample': 0.5, 'verbose': 0, 'random_state': 1337,
              'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 10,
              'n_estimators': 350, 'learning_rate': 0.05}
    clf = GradientBoostingClassifier(**params)
    prefix = 'lib/gbm350d4m10c15'

    if cvfold:
        c = classifier.Classifier(X, y)
        c.validate(clf, nFolds=10, out=prefix + 'Train.csv')

    if makeSub:
        Xt = np.array(pd.read_csv('../data/test.csv', usecols=range(1, 9)))
        Xt = np.hstack((Xt, np.array(pd.read_csv('../data/cprobTest15NA.csv'))))
        clf.fit(X, y)
        y_ = clf.predict_proba(Xt)[:, 1]
        out = pd.read_csv('subs/nbBaseTest.csv')
        out.ACTION = y_
        out.to_csv(prefix + 'Test.csv', index=False)

    if featureImportance:
        if not makeSub:
            # Importances only exist on a fitted model.
            clf.fit(X, y)
        print("Feature ranking:")
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        np.savetxt('indices.txt', indices, delimiter=',')
        # ``indices`` refers to columns of X (base + cprob features), so
        # names must come from the combined list. Looking them up in
        # df.columns alone mislabels features or raises IndexError.
        for rank, column in enumerate(indices):
            print("%d. feature (%s,%f)" % (rank + 1, feature_names[column], importances[column]))
def gbc_gp_predict(train_x, train_y, test_x):
    """Gate test rows with a GBM, then predict the gated rows with ``gbc_gp_predict_part``.

    Rows whose column-1 probability from the classifier exceeds 0.55 are
    passed to ``gbc_gp_predict_part``; every other row keeps a prediction
    of 0.
    """
    feature_indexs = getTopFeatures(train_x, train_y)

    def derive(raw):
        # Same derived-feature recipe for the train and the test matrix.
        return get_data(
            raw,
            feature_indexs[:16],
            features.feature_pair_sub_list,
            features.feature_pair_plus_list,
            features.feature_pair_mul_list,
            features.feature_pair_divide_list[:20],
        )

    sub_x_Train = derive(train_x)
    sub_x_Test = derive(test_x)

    labels = toLabels(train_y)
    gate = GradientBoostingClassifier(n_estimators=3000, max_depth=9)
    gate.fit(sub_x_Train, labels)
    gate_scores = gate.predict_proba(sub_x_Test)[:, 1]

    selected = np.flatnonzero(gate_scores > 0.55)
    gp_preds = np.zeros(len(test_x))
    gp_preds[selected] = gbc_gp_predict_part(sub_x_Train, train_y, sub_x_Test[selected])
    return gp_preds
def main(args):
    """Train the competition GBM and write positive-class probabilities.

    Args:
        args: parsed command-line namespace with ``verbose``,
            ``train_file``, ``test_file`` and ``output`` attributes.

    Returns:
        int: always 0.
    """
    global verbose
    verbose = args.verbose

    # Load files
    if verbose:
        logger.info('Loading {}'.format(args.train_file))
    train_X, train_y = load_file(args.train_file)
    if verbose:
        logger.info('Loading {}'.format(args.test_file))
    test_X, test_y = load_file(args.test_file)

    # Best parameters for the competition data. They came from a grid
    # search (GridSearchCV, scoring='roc_auc') over
    # learning_rate = 2**i for i in np.arange(-10, -9, .25), with
    # n_estimators=50000, max_features='log2' and max_depth=7.
    #
    # Two typos are fixed here: the exponent was written as the tuple
    # ``(-9,5)`` instead of ``-9.5``, and the comma after ``max_depth=7``
    # was missing (a syntax error).
    method = GradientBoostingClassifier(n_estimators=50000,
                                        learning_rate=2 ** (-9.5),
                                        max_features='log2',
                                        max_depth=7,
                                        random_state=1,
                                        verbose=1)
    method.fit(train_X.toarray(), train_y)
    pred = method.predict_proba(test_X.toarray())

    np.savetxt(args.output, pred[:, 1], fmt='%.6f')
    if verbose:
        logger.info('Wrote preds to {file}'.format(file=args.output))
    return 0
def classify2(dis_data, numeric_data, t_label):
    """Cross-validate a default GBM and return its mean ROC AUC and F1.

    Args:
        dis_data: DataFrame of features used for training.
        numeric_data: accepted for signature compatibility; not used.
        t_label: Series of binary labels; class 1 is the positive class.

    Returns:
        tuple: ``(mean_auc, mean_f1)`` averaged over 5 stratified folds.
    """
    fold = 5
    skf = StratifiedKFold(t_label, fold)
    roc_auc = 0
    f1_score_value = 0
    clf3 = GradientBoostingClassifier()

    for train, test in skf:
        clf3 = clf3.fit(dis_data.iloc[train], t_label.iloc[train])
        fold_labels = t_label.iloc[test]
        fold_features = dis_data.iloc[test]

        # Score with the positive-class probability. Column 0 (the
        # negative class) was used before, which reports 1 - AUC and
        # disagrees with the pos_label=1 convention of the F1 score below.
        # NOTE(review): column 1 assumes labels are {0, 1} so that
        # classes_[1] == 1 - confirm.
        probas_ = clf3.predict_proba(fold_features)
        fpr, tpr, thresholds = roc_curve(fold_labels, probas_[:, 1])
        roc_auc += auc(fpr, tpr)

        label_pred = clf3.predict(fold_features)
        f1_score_value += f1_score(fold_labels, label_pred, pos_label=1)

    return roc_auc / fold, f1_score_value / fold
def test(self):
    """Grid-search blending weights for a GBM / random forest / linear SVC ensemble.

    Uses ``self.dataMat`` and ``self.labelMat``. The three models are
    fitted on 40% of the data and cross-validated (ROC AUC, 3 folds) on all
    of it. For every weight pair whose blended CV score exceeds 0.9, the
    log loss and ROC AUC of the blended hold-out probabilities are printed.
    """
    X, y = self.dataMat, self.labelMat
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.6, random_state=12)

    params = {'n_estimators': 1200, 'max_depth': 4, 'subsample': 0.5, 'learning_rate': 0.01,
              'min_samples_leaf': 1, 'random_state': 3}
    clf_GBC = GradientBoostingClassifier(**params)
    clf_GBC.fit(X_train, y_train)
    scores_GBC = cross_val_score(clf_GBC, X, y, cv=3, scoring='roc_auc')

    clf_RFC = RandomForestClassifier(max_depth=6, min_samples_split=7, min_samples_leaf=9, n_estimators=12)
    clf_RFC.fit(X_train, y_train)
    scores_RFC = cross_val_score(clf_RFC, X, y, cv=3, scoring='roc_auc')

    clf_SVC = SVC(kernel='linear', C=0.001, probability=True)
    clf_SVC.fit(X_train, y_train)
    scores_SVC = cross_val_score(clf_SVC, X, y, cv=3, scoring='roc_auc')

    # Everything below is independent of the weights, so it is computed
    # once instead of in each of the ~9,600 loop iterations.
    proba_GBC = clf_GBC.predict_proba(X_test)
    proba_RFC = clf_RFC.predict_proba(X_test)
    proba_SVC = clf_SVC.predict_proba(X_test)
    mean_GBC = scores_GBC.mean()
    mean_RFC = scores_RFC.mean()
    mean_SVC = scores_SVC.mean()
    scorestd = math.sqrt(scores_GBC.std()**2 + scores_RFC.std()**2 + scores_SVC.std()**2)

    for w1 in arange(0.01, 0.99, 0.01):
        for w2 in arange(0.01, 0.99, 0.01):
            # Operand order is kept as originally written so the
            # floating-point results are unchanged.
            scoremean = mean_GBC*w1 + mean_RFC*(1-w2)*(1-w1) + mean_SVC*w2*(1-w1)
            if scoremean > 0.9:
                y_predprob = proba_GBC*w1 + proba_RFC*(1-w2)*(1-w1) + proba_SVC*w2*(1-w1)
                print('***********************************************************')
                print('GBC-weight = %s RFC = %s SVC = %s' % (w1, (1-w1)*(1-w2), w2*(1-w1)))
                print('The log loss is: %s' % (log_loss(y_test, y_predprob),))
                print('The ROC score is: %s' % (roc_auc_score(y_test, y_predprob[:, 1]),))
                print("Accuracy: %0.5f (+/- %0.5f)" % (scoremean, scorestd*2))
def calc_prob(df_features_driver, df_features_other):
    """Score a driver's trips with a GBM trained against other drivers' trips.

    Args:
        df_features_driver: feature rows of the driver being scored.
        df_features_other: feature rows of reference trips.

    Returns:
        DataFrame with ``driver_trip`` (from ``create_first_column``) and
        ``prob``, the column-1 probability for each of the driver's rows.
    """
    # pd.concat replaces DataFrame.append, which current pandas no longer
    # provides; the row order (driver rows first) is unchanged.
    df_train = pd.concat([df_features_driver, df_features_other])
    df_train.reset_index(inplace=True)
    df_train.Driver = df_train.Driver.astype(int)

    # Other estimators (bagged forests, SVC, logistic regression, AdaBoost,
    # extra trees) were tried here before settling on gradient boosting.
    model = GradientBoostingClassifier(n_estimators=10000)

    # NOTE(review): columns 0-3 (the old index from reset_index plus three
    # leading columns) are treated as non-features - confirm the layout.
    feature_columns = df_train.iloc[:, 4:]

    # Train the classifier
    model.fit(feature_columns, df_train.Driver)

    df_submission = pd.DataFrame()
    df_submission['driver_trip'] = create_first_column(df_features_driver)

    # The driver's own rows come first in df_train. Score exactly that
    # many rows instead of a hard-coded 200, so the result lines up with
    # df_features_driver whatever its length.
    n_driver_rows = len(df_features_driver)
    probs_array = model.predict_proba(feature_columns[:n_driver_rows])
    df_submission['prob'] = np.array(pd.DataFrame(probs_array).iloc[:, 1])
    return df_submission
def classify():
    """Fit a GBM on the first stratified fold and dump ranked predictions to foo.csv.

    Only the first of the four folds is processed. Feature importances and
    the matching column names are printed in decreasing order of
    importance, and the held-out rows are written sorted by ascending
    column-1 probability as (tag, label, probability).
    """
    ps = {'n_estimators': 155, 'learning_rate': 0.01673821514381137, 'max_depth': 4}
    xx, y, tags, columns = get_data('/home/rodion/facebids/train/join.count.proba.time.csv')
    gbdt = GradientBoostingClassifier(**ps)

    for fit_rows, held_rows in StratifiedKFold(y, 4):
        gbdt.fit(xx[fit_rows], y[fit_rows])
        y_b = y[held_rows]
        tags_b = tags[held_rows]

        by_importance = np.argsort(np.array(gbdt.feature_importances_))[::-1]
        print(np.asarray(gbdt.feature_importances_)[by_importance])
        print(np.asarray(columns)[by_importance])

        proba = gbdt.predict_proba(xx[held_rows])[:, 1]
        by_proba = np.argsort(proba)
        table = np.array([tags_b[by_proba], y_b[by_proba], proba[by_proba]]).T
        np.savetxt("foo.csv", table, delimiter=",", fmt="%s")
        # Only the first fold is used.
        break
def gbPredict(LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START, N_FOLD, EX_F, TRAIN_DATA_X, TRAIN_DATA_Y, TEST__DATA_X, isProb):
    """Estimate the k-fold error of a GBM, then fit on all data and predict.

    Args:
        LOSS, N_EST, L_RATE, M_DEPT, SUB_S, W_START: passed to
            ``GradientBoostingClassifier`` as loss, n_estimators,
            learning_rate, max_depth, subsample and warm_start.
        N_FOLD: number of cross-validation folds.
        EX_F: feature-exclusion setting; only echoed in the printed
            summary while feature extraction is disabled.
        TRAIN_DATA_X, TRAIN_DATA_Y: training arrays (indexed by row arrays).
        TEST__DATA_X: rows to predict.
        isProb: return ``predict_proba`` when true, else ``predict``.

    Returns:
        tuple: ``(data, eVal)`` - the predictions and the mean
        misclassification rate over the folds.
    """
    def new_model():
        # Fresh, identically configured classifier for every fit.
        return GradientBoostingClassifier(loss=LOSS, n_estimators=N_EST, learning_rate=L_RATE,
                                          max_depth=M_DEPT, subsample=SUB_S, warm_start=W_START)

    # k-fold validation
    kf = KFold(TRAIN_DATA_Y.shape[0], n_folds=N_FOLD)
    tesV = 0.0
    for train_index, test_index in kf:
        fold_model = new_model().fit(TRAIN_DATA_X[train_index], TRAIN_DATA_Y[train_index])
        tesV += 1 - fold_model.score(TRAIN_DATA_X[test_index], TRAIN_DATA_Y[test_index])
    eVal = tesV / N_FOLD

    # train all data
    clf = new_model().fit(TRAIN_DATA_X, TRAIN_DATA_Y)

    # Importance-based column selection (``extA``) is disabled for the
    # training data, so the test data must not be subset either. The old
    # ``TEST__DATA_X[:, extA]`` referenced a name that was never defined
    # and raised NameError.
    if isProb:
        data = clf.predict_proba(TEST__DATA_X)
    else:
        data = clf.predict(TEST__DATA_X)

    print("Eval = %s with n_esti = %s l_rate = %s m_dep = %s sub_s = %s ex_num = %s and loss is %s" % (
        eVal, N_EST, L_RATE, M_DEPT, SUB_S, EX_F, LOSS))
    return (data, eVal)
def do_all_study(X,y):
    """Plot learning and validation curves and print held-out ROC AUC.

    NOTE(review): the ``X`` and ``y`` arguments are not used; the body reads
    ``X_train``, ``y_train``, ``X_test`` and ``y_test`` from the enclosing
    scope - confirm this is intended.

    The ``names`` list has one more entry than ``classifiers`` ("Naive Bayes"
    has no estimator), so ``zip`` stops after the four listed classifiers.
    """
    names = ["Decision Tree", "Gradient Boosting", "Random Forest", "AdaBoost", "Naive Bayes"]
    classifiers = [
        # SVC(),
        DecisionTreeClassifier(max_depth=10),
        GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
        RandomForestClassifier(max_depth=10, n_estimators=20, max_features=1),
        AdaBoostClassifier(),
    ]

    # One learning curve per classifier; the returned values are not used.
    for _label, model in zip(names, classifiers):
        _estimator, _score = plot_learning_curve(model, X_train, y_train, scoring='roc_auc')

    # Validation curve over n_estimators, then a final fit and test-set AUC,
    # first for gradient boosting and then for AdaBoost.
    tuned_models = [
        (GradientBoostingClassifier(max_depth=10, n_estimators=20, max_features=1),
         "ROC AUC GradientBoostingClassifier: %0.4f"),
        (AdaBoostClassifier(),
         "ROC AUC AdaBoost: %0.4f"),
    ]
    for model, report_format in tuned_models:
        plot_validation_curve(model, X_train, y_train, 'n_estimators', [1, 5, 10, 20, 40], scoring='roc_auc')
        model.fit(X_train, y_train)
        positive_scores = model.predict_proba(X_test)[:, 1]
        print(report_format % roc_auc_score(y_test, positive_scores))
def main():
    """Train a gradient boosting model on Numerai data and write a submission.

    Reads the training, selected-feature and tournament CSV files from
    ``./datasets``, fits a ``GradientBoostingClassifier`` on the
    ``target_bernie`` column, saves the fitted model to ``./models``, prints
    accuracy and log loss on the validation rows, and writes the tournament
    probabilities to ``./results/bernie_submission_gbc.csv``.
    """
    # Set seed for reproducibility. np.random.seed() returns None, so the
    # integer is kept separately to be usable as `random_state`.
    seed = 42
    np.random.seed(seed)

    print("# Loading data...")
    train = pd.read_csv('./datasets/numerai_training_data.csv', header=0)
    selected_features = pd.read_csv('./datasets/x_new.csv', header=0)
    tournament = pd.read_csv('./datasets/numerai_tournament_data.csv', header=0)
    validation = tournament[tournament['data_type'] == 'validation']

    train_bernie = train
    features = [f for f in list(selected_features) if "feature" in f]
    X = train_bernie[features]
    Y = train_bernie['target_bernie']
    x_prediction = validation[features]
    ids = tournament['id']

    # CONFIGURE YOUR MODELS:
    # Stochastic Gradient Boosting Classification
    num_trees = 25
    # Only needed by the optional cross-validation below. No random_state:
    # without shuffling it has no effect and recent scikit-learn rejects it.
    kfold = model_selection.KFold(n_splits=len(train['era'].unique()))

    # Configure model
    modelGBC = GradientBoostingClassifier(n_estimators=num_trees, random_state=seed, verbose=2)

    # Train and test with kfold model iterations
    # results = model_selection.cross_val_score(modelGBC, X, Y, cv=kfold)
    # print(results.mean())

    # UNCOMMENT IF WANT TO LOAD THE TRAINED MODEL
    # modelGBC = joblib.load('./models/gradient_boosting_classifier.joblib')
    modelGBC.fit(X, Y)

    # COMMENT IF YOU DON'T WANT TO SAVE THE TRAINED MODEL
    # Dumped after fit() so the stored estimator is actually trained.
    joblib.dump(modelGBC, './models/gradient_boosting_classifier.joblib')

    # USE THE TRAINED MODEL AGAINST THE TEST SET (x_prediction is the validation set)
    y_prediction = modelGBC.predict_proba(x_prediction)
    probabilities = y_prediction[:, 1]
    print(probabilities)
    print("- probabilities GBC:", probabilities[1:6])
    print("- target:\n", validation['target_bernie'][1:6])
    print("- rounded probability:", [round(p) for p in probabilities][1:6])

    correct = [
        round(x) == y
        for (x, y) in zip(probabilities, validation['target_bernie'])
    ]
    print("- accuracy: ", sum(correct) / float(validation.shape[0]))
    print("- validation logloss:", metrics.log_loss(validation['target_bernie'], probabilities))

    # To submit predictions from your model to Numerai, predict on the entire tournament data.
    print("PREDICTIONS FOR THE TOURNAMENT *******************")
    x_prediction = tournament[features]
    print("\nPREDICTIONS USING GBC")
    y_prediction = modelGBC.predict_proba(x_prediction)
    results = y_prediction[:, 1]
    # results = np.round_(results)
    results_GBC = pd.DataFrame(data={'probability_bernie': results})
    joined = pd.DataFrame(ids).join(results_GBC)
    print("- joined:", joined.head())
    print("# Writing predictions to bernie_submissions_gbc.csv...")
    # Save the predictions out to a CSV file.
    # print("# Creating submission...")
    joined.to_csv("./results/bernie_submission_gbc.csv", index=False)
# Log loss after each boosting iteration: every staged raw prediction is
# squashed through sigmoid() before being scored.
loss_train_by_iter = [log_loss(y_train, sigmoid(staged)) for staged in predict_train_by_iter]
loss_test_by_iter = [log_loss(y_test, sigmoid(staged)) for staged in predict_test_by_iter]

# Iteration (0-based index) with the smallest test loss; reported 1-based.
min_loss_index = np.argmin(loss_test_by_iter)
best_test_loss = loss_test_by_iter[min_loss_index]
print('learning_rate=%s, min_loss_value=%s, iteration(from 1)=%s' % (learning_rate, best_test_loss, min_loss_index + 1))

# Train and test loss curves on one figure, titled with the learning rate.
plt.title(learning_rate)
for curve in (loss_train_by_iter, loss_test_by_iter):
    plt.plot(curve)
plt.show()

# Random forest baseline scored with the same metric on the test split.
clf = RandomForestClassifier(n_estimators=37, random_state=241)
clf.fit(X_train, y_train)
prediction = clf.predict_proba(X_test)
loss_value = log_loss(y_test, prediction)
print('Random forest classifier min loss value = ', loss_value)
n_folds = 5
skf = list(StratifiedKFold(y, n_folds))

for j, clf in enumerate(clfs):
    # Train each base model in turn.
    # print(j, clf)
    dataset_blend_test_j = np.zeros((X_predict.shape[0], len(skf)))
    for i, (train, test) in enumerate(skf):
        # Fit on every fold except the i-th; the predictions for the i-th
        # fold become that fold's new feature in column j.
        # print("Fold", i)
        X_train, y_train = X[train], y[train]
        X_test, y_test = X[test], y[test]
        clf.fit(X_train, y_train)
        y_submission = clf.predict_proba(X_test)[:, 1]
        dataset_blend_train[test, j] = y_submission
        dataset_blend_test_j[:, i] = clf.predict_proba(X_predict)[:, 1]
    # For the prediction set, the new feature is the mean over the k fold models.
    dataset_blend_test[:, j] = dataset_blend_test_j.mean(1)
    base_auc = roc_auc_score(y_predict, dataset_blend_test[:, j])
    print("val auc Score: %f" % base_auc)

# Second-level model trained on the out-of-fold predictions.
# clf = LogisticRegression()
clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=30)
clf.fit(dataset_blend_train, y)
y_submission = clf.predict_proba(dataset_blend_test)[:, 1]

print("Linear stretch of predictions to [0,1]")
lowest, highest = y_submission.min(), y_submission.max()
y_submission = (y_submission - lowest) / (highest - lowest)

print("blend result")
print("val auc Score: %f" % (roc_auc_score(y_predict, y_submission)))
# In[ ]:
predictions_GBC = GBC.predict(X_test)

# In[ ]:
print(classification_report(y_test, predictions_GBC))

# In[ ]:
# Scan decision thresholds 0.00 .. 0.99 and record the F1 score of each.
predictions_GBC_prob = GBC.predict_proba(X_test)
prob_list_GBC = [row[1] for row in predictions_GBC_prob]
d_GBC = {}
for threshold in np.arange(0.0, 1.0, 0.01):
    # 1 where the positive-class probability reaches the threshold, else 0.
    list_for_check_GBC = (np.asarray(prob_list_GBC) >= threshold).astype(np.int_)
    d_GBC[threshold] = f1_score(y_test, list_for_check_GBC)
df_GBC = pd.DataFrame.from_dict(d_GBC, orient='index')

# In[ ]:
# Row(s) whose F1 score equals the maximum.
df_GBC[df_GBC[0] == df_GBC[0].max()]

# In[ ]:
# 弱分类器的数目 n_estimator = 10 # 调用GBDT分类模型 grd = GradientBoostingClassifier(n_estimators=n_estimator) # 调用one-hot编码。 grd_enc = OneHotEncoder() # 调用LR分类模型。 grd_lm = LogisticRegression() # 使用X_train训练GBDT模型,后面用此模型构造特征 grd.fit(X_train, y_train) # 直接进行预测,查看AUC得分 y_pred_grd = grd.predict_proba(X_test)[:, 1] fpr_grd, tpr_grd, _ = metrics.roc_curve(y_test, y_pred_grd) roc_auc = metrics.auc(fpr_grd, tpr_grd) print 'predict', roc_auc # fit one-hot编码器 tmp = grd.apply(X_train) grd_enc.fit(grd.apply(X_train)[:, :, 0]) # 使用训练好的GBDT模型构建特征,然后将特征经过one-hot编码作为新的特征输入到LR模型训练。 grd_lm.fit(grd_enc.transform(grd.apply(X_train_lr)[:, :, 0]), y_train_lr) # 用训练好的LR模型多X_test做预测
print(search.best_estimator_)
print(search.best_score_)

# evaluation: hard predictions and confusion matrices for both splits
prediction_train = GBDT.predict(feature_train_scaled)
prediction_test = GBDT.predict(feature_test_scaled)
cm_train = confusion_matrix(y_train_decode, prediction_train)
cm_test = confusion_matrix(y_test_decode, prediction_test)
confusion_report = "Confusion matrix for training dataset is \n%s\n for testing dataset is \n%s.\n" % (cm_train, cm_test)
print(confusion_report)

target_names = [
    'class 1', 'class 2', 'class 3',
    'class 4', 'class 5', 'class 6',
    'class 7', 'class 8', 'class 9',
]
test_report = classification_report(y_test_decode, prediction_test, target_names=target_names)
print(test_report)

y_score = GBDT.predict_proba(feature_test_scaled)
# Compute the micro-averaged AUC by flattening labels and scores.
# print('auc via roc_auc_score:', roc_auc_score(y_test, y_score, average='micro'))
fpr, tpr, thresholds = roc_curve(y_test.ravel(), y_score.ravel())
micro_auc = auc(fpr, tpr)
print('micro_auc:', micro_auc)
if __name__ == '__main__':
    # Script entry point: load the data, hold out 25% for testing, fit a
    # gradient boosting classifier and print its evaluation metrics.
    x_data, y_data = load_data()
    X = np.array(x_data)
    Y = np.array(y_data)
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=5)
    # Build and train the gradient boosting classifier
    clf = GradientBoostingClassifier(random_state=0)
    clf.fit(x_train, y_train)
    # Predict class labels and class probabilities for the test split
    y_predict = clf.predict(x_test)
    y_pred_probability = clf.predict_proba(x_test)
    df2 = pd.DataFrame(y_pred_probability)
    proba_pred_y = np.array(df2[1])  # keep column 1: the predicted probability of the positive class
    score = clf.score(x_test, y_test)
    print("Gradient Boosting 模型打分: Score = %f" % score)
    accuracy = Get_Accuracy(y_test, y_predict)
    print("Gradient Boosting Accuracy_Score = %f" % accuracy)
    precision = Get_Precision_score(y_test, y_predict)
    print("Gradient Boosting Precision = %f" % precision)
    recall = Get_Recall(y_test, y_predict)
    print("Gradient Boosting Recall = %f" % recall)
    # NOTE(review): the next two names match scikit-learn metric functions
    # (`f1_score`, `auc`); if those are imported here they get rebound.
    f1_score = Get_f1_score(y_test, y_predict)
    print("Gradient Boosting F1-Score = %f" % f1_score)
    auc = Get_Auc_value(y_test, proba_pred_y)
def compare_assessors(X, y):
    """Plot ROC curves of five tree-ensemble based classifiers on one figure.

    The data is split 90/10 into train/test, and the training part is split
    90/10 again so that the logistic regressions stacked on top of the tree
    ensembles are fitted on rows the ensembles have not seen.

    Compared models: random trees embedding + LR, random forest, random
    forest leaves + LR, gradient boosting, gradient boosting leaves + LR.

    :param X: feature matrix.
    :param y: binary target.
    """
    n_estimator = 20
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    # It is important to train the ensemble of trees on a different subset
    # of the training data than the linear regression model to avoid
    # overfitting, in particular if the total number of leaves is
    # similar to the number of training samples
    X_train, X_train_lr, y_train, y_train_lr = train_test_split(X_train, y_train, test_size=0.1)

    def roc_points(positive_scores):
        # (fpr, tpr) of the given scores against the held-out labels.
        fpr, tpr, _ = roc_curve(y_test, positive_scores)
        return fpr, tpr

    curves = []  # (legend label, (fpr, tpr)) in plotting order

    # Unsupervised transformation based on totally random trees
    embedding_pipeline = make_pipeline(
        RandomTreesEmbedding(n_estimators=n_estimator, random_state=0),
        LogisticRegression(),
    )
    embedding_pipeline.fit(X_train, y_train)
    curves.append(('RT + LR', roc_points(embedding_pipeline.predict_proba(X_test)[:, 1])))

    # Supervised transformation based on random forests
    forest = RandomForestClassifier(n_estimators=n_estimator)
    forest.fit(X_train, y_train)
    curves.append(('RF', roc_points(forest.predict_proba(X_test)[:, 1])))

    # RF + LR: one-hot encode the forest's leaf indices, fit LR on them
    forest_encoder = OneHotEncoder()
    forest_encoder.fit(forest.apply(X_train))
    forest_lr = LogisticRegression()
    forest_lr.fit(forest_encoder.transform(forest.apply(X_train_lr)), y_train_lr)
    forest_lr_scores = forest_lr.predict_proba(forest_encoder.transform(forest.apply(X_test)))[:, 1]
    curves.append(('RF + LR', roc_points(forest_lr_scores)))

    # GBT
    boosting = GradientBoostingClassifier(n_estimators=n_estimator)
    boosting.fit(X_train, y_train)
    curves.append(('GBT', roc_points(boosting.predict_proba(X_test)[:, 1])))
    # Accuracies are computed but not used.
    boosting.score(X_train, y_train)
    boosting.score(X_test, y_test)

    # GBT + LR: apply() returns one extra trailing axis, index 0 is taken
    boosting_encoder = OneHotEncoder()
    boosting_encoder.fit(boosting.apply(X_train)[:, :, 0])
    boosting_lr = LogisticRegression()
    boosting_lr.fit(boosting_encoder.transform(boosting.apply(X_train_lr)[:, :, 0]), y_train_lr)
    boosting_lr_scores = boosting_lr.predict_proba(
        boosting_encoder.transform(boosting.apply(X_test)[:, :, 0]))[:, 1]
    curves.append(('GBT + LR', roc_points(boosting_lr_scores)))

    plt.figure(1)
    plt.plot([0, 1], [0, 1], 'k--')
    for legend_label, (fpr, tpr) in curves:
        plt.plot(fpr, tpr, label=legend_label)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
# Held-out generated samples: everything after the first bdt_train_size images.
fake_test_data = np.squeeze(images[bdt_train_size:])
# Label real samples 1 and generated ("fake") samples 0.
real_training_labels = np.ones(bdt_train_size)
fake_training_labels = np.zeros(bdt_train_size)
# Stack real and fake training data (and labels) in the same order.
total_training_data = np.concatenate(
    (real_training_data, fake_training_data))
total_training_labels = np.concatenate(
    (real_training_labels, fake_training_labels))
clf.fit(total_training_data, total_training_labels)
out_real = clf.predict_proba(real_test_data)
out_fake = clf.predict_proba(fake_test_data)
if mode != "ROC_testing":
    # Overlay the class-1 ("real") probability histograms of both test sets.
    # NOTE(review): the extent of this `if` body was reconstructed from
    # whitespace-collapsed source - confirm against the original file.
    plt.hist([out_real[:, 1], out_fake[:, 1]], bins=100, label=['real', 'gen'], histtype='step')
    plt.xlabel('Output of BDT')
    plt.legend(loc='upper right')
    plt.savefig('%s%s/BDT_out.png' % (working_directory, saving_directory), bbox_inches='tight')
    plt.close('all')
IDcol = 'ID' #将Disbursed字段的值分类统计数目,0值多少个,1的值多少个 train['Disbursed'].value_counts() #挑选不是Disbursed和ID的列 x_columns = [x for x in train.columns if x not in [target, IDcol]] #X为因子矩阵 X = train[x_columns] #y是结果矩阵 y = train['Disbursed'] #gbm0为 gbm0 = GradientBoostingClassifier(random_state=10) gbm0.fit(X, y) #根据X值预测 y_pred = gbm0.predict(X) #每个X样本为1的概率 y_predprob = gbm0.predict_proba(X)[:, 1] #打印分类准确的百分比。 print "Accuracy : %.4g" % metrics.accuracy_score(y.values, y_pred) #直接根据真实值(必须是二值)、预测值(可以是0/1,也可以是proba值)计算出auc值,中间过程的roc计算省略 print "AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob) #range(start,stop ,step) param_test1 = {'n_estimators': range(20, 81, 10)} gsearch1 = GridSearchCV(estimator=GradientBoostingClassifier( learning_rate=0.1, min_samples_split=300, min_samples_leaf=20, max_depth=8, max_features='sqrt', subsample=0.8, random_state=10), param_grid=param_test1,
X_train, X_test, y_train_named, y_test_named, y_train, y_test = train_test_split( X, y_named, y, random_state=0) # Build the gradient boosting model gbrt = GradientBoostingClassifier(random_state=0).fit(X_train, y_train) print("X_test.shape: {}".format(X_test.shape)) print("Decision function shape: {}".format( gbrt.decision_function(X_test).shape)) df = gbrt.decision_function(X_test) print("Thresholded decision function:\n{}".format( gbrt.decision_function(X_test) > 0)) greater_zero = (gbrt.decision_function(X_test) > 0).astype(int) # Predicting probabilities print("Shape of probabilities: {}".format(gbrt.predict_proba(X_test).shape)) pp = gbrt.predict_proba(X_test) print(pp[0, 0] + pp[0, 1]) # Visualization fig, axes = plt.subplots(1, 2, figsize=(13, 5)) mglearn.tools.plot_2d_separator(gbrt, X, ax=axes[0], alpha=.4, fill=True, cm=mglearn.cm2) scores_image = mglearn.tools.plot_2d_scores(gbrt, X, ax=axes[1], alpha=.4,
'home_ownership', 'verification_status', 'desc_clean', 'purpose', 'zip_code', 'addr_state', 'pub_rec_bankruptcies_clean'
]
# One-hot encode the categorical features via dict records.
v = DictVectorizer(sparse=False)
X1 = v.fit_transform(trainData[cat_features].to_dict('records'))
# Put the one-hot encoded and the numerical variables side by side for training
X2 = np.matrix(trainData[num_features])
X = np.hstack([X1, X2])
y = trainData['y']
# Train a GBDT model without any hyper-parameter tuning
gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(X, y)
y_pred = gbm0.predict(X)
y_predprob = gbm0.predict_proba(X)[:, 1].T
# Both metrics below are computed on the training data itself.
print "Accuracy : %.4g" % metrics.accuracy_score(y, y_pred)
print "AUC Score (Train): %f" % metrics.roc_auc_score(np.array(y.T), y_predprob)
''' 第四步:在测试集上测试模型的性能 '''
# Convert percentages written with '%' into floats
testData['int_rate_clean'] = testData['int_rate'].map(
    lambda x: float(x.replace('%', '')) / 100)
# Convert the employment length, otherwise ordering would be affected
testData['emp_length_clean'] = testData['emp_length'].map(CareerYear)
# Treat a missing desc as one state and a present desc as another
testData['desc_clean'] = testData['desc'].map(DescExisting)
# Handle dates: earliest_cr_line comes in inconsistent formats and has to be
# normalised and converted into Python dates
testData['app_date_clean'] = testData['issue_d'].map(
shu = data_21
# Standardised features and labels used for 5-fold cross-validation.
X = scale(shu)
y = label
sepscores = []
cv_clf = GradientBoostingClassifier(n_estimators=2000, max_depth=6, learning_rate=0.01)
skf = StratifiedKFold(n_splits=5)

# Accumulators start with one placeholder row of 0.5s; every fold's one-hot
# labels / predicted probabilities are stacked underneath.
ytest = np.full((1, 2), 0.5)
yscore = np.full((1, 2), 0.5)

for train, test in skf.split(X, y):
    y_train = utils.to_categorical(y[train])
    hist = cv_clf.fit(X[train], y[train])

    y_score = cv_clf.predict_proba(X[test])
    y_test = utils.to_categorical(y[test])
    yscore = np.vstack((yscore, y_score))
    ytest = np.vstack((ytest, y_test))

    # ROC / AUC of this fold, computed on column 0 of labels and scores.
    fpr, tpr, _ = roc_curve(y_test[:, 0], y_score[:, 0])
    roc_auc = auc(fpr, tpr)

    y_class = utils.categorical_probas_to_classes(y_score)
    y_test_tmp = y[test]
    acc, precision, npv, sensitivity, specificity, mcc, f1 = utils.calculate_performace(
        len(y_class), y_class, y_test_tmp)

    fold_row = [acc, precision, npv, sensitivity, specificity, mcc, f1, roc_auc]
    sepscores.append(fold_row)
    print('GTB:acc=%f,precision=%f,npv=%f,sensitivity=%f,specificity=%f,mcc=%f,f1=%f,roc_auc=%f' % tuple(fold_row))

scores = np.array(sepscores)
# Show the dense array type and the shape of the sparse matrix X.
print(type(X.toarray()))
print(X.todense().shape)

# In[11]:
# create dataframe: one column per name in `cols`, plus the label
import pandas as pd
df = pd.DataFrame(X.toarray())
df.columns = cols
df["label"] = y
df.head()

# In[12]:
# test: keep column 1 of predict_proba for every row as the prediction
y_pred = [x[1] for x in sk_gbt.predict_proba(df[cols])]
df["pred"] = y_pred
df.head()

# In[13]:
# auc
# NOTE(review): the definition below is cut off inside its docstring in this
# view; it is reproduced unchanged.
def auc(y_true, y_pred):
    """ calculate auc Args: y_true: label y_pred: predict Return: auc
print('\nPredicted probabilities:')
print(adaboost_y_pred_prob[:5, ])
# Error and weight of AdaBoost's first estimator.
first_error = adaboost.estimator_errors_[0]
first_weight = adaboost.estimator_weights_[0]
print("Error: {0:.2f}".format(first_error))
print("Tree importance: {0:.2f}".format(first_weight))

# GradientBoosting Trees (depth-1 trees, 1000 stages)
gbc = GradientBoostingClassifier(
    max_depth=1,
    n_estimators=1000,
    warm_start=True,
    random_state=seed,
)
gbc.fit(x_train, y_train)

# predictions
gbc_y_pred = gbc.predict(x_test)
gbc_y_pred_prob = gbc.predict_proba(x_test)

# accuracy on hard labels, log loss on probabilities
gbc_accuracy = accuracy_score(y_test, gbc_y_pred)
gbc_logloss = log_loss(y_test, gbc_y_pred_prob)

print("== Gradient Boosting ==")
print("Accuracy: {0:.2f}".format(gbc_accuracy))
print("Log loss: {0:.2f}".format(gbc_logloss))

# First five rows of the true labels, predicted labels and probabilities.
for heading, values in (
    ("True labels:", y_test),
    ('\nPredicted labels:', gbc_y_pred),
    ('\nPredicted probabilities:', gbc_y_pred_prob),
):
    print(heading)
    print(values[:5, ])
# find patients with a certain disease in target domain target_train_feature_true = train_ori.loc[:, disease_list.iloc[disease_num, 0]] > 0 target_train_meaningful_sample = train_ori.loc[target_train_feature_true] # get patients with small disease in test dataset (target domain's test sample) target_test_feature_true = test_ori.loc[:, disease_list.iloc[disease_num, 0]] > 0 target_test_meaningful_sample = test_ori.loc[target_test_feature_true] X_test = target_test_meaningful_sample.drop(['Label'], axis=1) y_test = target_test_meaningful_sample['Label'] # # transfer to X_test # fit_test = X_test * Weight_importance_source_data # fit_test = fit_test * Weight_importance_from_middle_data # use source model to predict each group disease's AUC y_predict_by_source_model = gbm_All.predict_proba(X_test)[: , 1] auc_by_source_model = roc_auc_score(y_test , y_predict_by_source_model) auc_source_dataframe.loc[disease_list.iloc[disease_num , 0] , auc_global_dataframe_columns[data_num - 1]] = auc_by_source_model # use middle model to predict each group disease's AUC y_predict_by_middle_model = gbm_large_group.predict_proba(X_test)[:, 1] auc_by_middle_model = roc_auc_score(y_test, y_predict_by_middle_model) auc_middle_dataframe.loc[disease_list.iloc[disease_num, 0], auc_global_dataframe_columns[data_num - 1]] = auc_by_middle_model # 按不同的sample_size,df.sample进行随机抽样 for frac in sample_size: auc_list = [] i = 0 while i < 10: # random sampling for test auc random_sampling_train_meaningful_sample = target_train_meaningful_sample.sample(frac=frac, axis=0)
from sklearn.ensemble import RandomForestClassifier
# fit() returns the estimator itself, so construction and fitting are chained.
forest = RandomForestClassifier(n_estimators=5, random_state=2).fit(X_train, y_train)

#----------------
# Gradient boosting
from sklearn.ensemble import GradientBoostingClassifier
gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0).fit(X_train, y_train)
gbrt_train_accuracy = gbrt.score(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt_train_accuracy))
gbrt_test_accuracy = gbrt.score(X_test, y_test)
print("Accuracy on test set: {:.3f}".format(gbrt_test_accuracy))
# Class probabilities of the first 16 test samples.
print(gbrt.predict_proba(X_test[:16]))

#----------------
# SVM - preprocessing matters
from sklearn.svm import SVC
svc = SVC().fit(X_train, y_train)
svc_train_accuracy = svc.score(X_train, y_train)
print("Accuracy on training set: {:.2f}".format(svc_train_accuracy))
svc_test_accuracy = svc.score(X_test, y_test)
print("Accuracy on test set: {:.2f}".format(svc_test_accuracy))

#----------------
# MLPClassifier - scikit-learn's multilayer perceptron - preprocessing matters
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(solver='lbfgs', random_state=0, hidden_layer_sizes=[10])
def main():
    """Compare a gradient boosting mortality model with the published OASIS score.

    Loads the OASIS table, trains a ``GradientBoostingClassifier`` on the ten
    OASIS component scores to predict ``icustay_expire_flag``, prints
    accuracy, AUROC, SMR and Brier scores for the held-out third of the data,
    and plots the ROC curve of the model next to the ROC curve obtained from
    the ``oasis_prob`` column of the same rows.
    """
    # load csv files
    df = pd.read_csv('/home/clintone/MIMICmaterialized/oasis.csv')

    # Dataframe with icustay_id, age group and icustay_expire_flag (not used below)
    df_flag = df[['icustay_id', 'icustay_age_group', 'icustay_expire_flag']].copy()

    # target variable and feature matrix
    y = df['icustay_expire_flag'].copy()
    score_columns = [
        'age_score', 'preiculos_score', 'gcs_score', 'heartrate_score',
        'meanbp_score', 'resprate_score', 'temp_score', 'urineoutput_score',
        'mechvent_score', 'electivesurgery_score',
    ]
    X = df[score_columns].copy()

    # train-test split, stratified on the target
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=.33, random_state=0, stratify=y)

    # Model configuration
    rf = GradientBoostingClassifier(
        random_state=0,
        learning_rate=0.01,
        max_features='sqrt',
        max_leaf_nodes=12,
        n_estimators=1250,
    )

    # Rows of the full table that ended up in the test split. Their
    # 'oasis_prob' column (predicted probabilities of the published OASIS
    # model) is the baseline/yardstick for the ROC curve and Brier score.
    y_ind_prob = df.loc[X_test.index]
    baseline_proba = y_ind_prob['oasis_prob']

    # Fit model
    rf.fit(X_train, y_train)

    # Test prediction
    pred = rf.predict(X_test)
    print('Accuracy score: {:.3}'.format(rf.score(X_test, y_test)))

    # Predicted probabilities of 1 (Death)
    y_predict_proba = rf.predict_proba(X_test)
    y_proba = y_predict_proba[:, 1]

    # AUROC score
    print('AUROC: {:.3}'.format(roc_auc_score(y_test, y_proba)))

    # Standard Mortality Rate (SMR): observed deaths over predicted deaths
    SMR = sum(y_test) / sum(pred)
    print('SMR: {:.3}'.format(SMR))
    # (different way) print('SMR: {:.3}'.format(sum(y_test)/sum(pred)))

    # Brier score the long way: mean squared difference between the
    # predicted probability and the outcome
    Brier = np.mean(np.square(y_proba - y_test))
    print('Brier Score: {:.3}'.format(Brier))

    # scikit-learn's own Brier score, kept as a cross-check of the manual value
    print('Brier Score [SKLEARN]: {:.3}'.format(brier_score_loss(y_test, y_proba)))
    # (different way) to do the above ---> print('Brier Score: {:.3}'.format(np.mean(np.square(y_proba - y_test))))

    # Brier score for the published OASIS predicted scores
    print('Brier Score [IND]: {:.3}'.format(np.mean(np.square(baseline_proba - y_test))))

    # fpr and tpr for all thresholds of the classification
    # probs = model.predict_proba(X_test)
    # preds = probs[:,1]
    fpr, tpr, threshold = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    # ROC curve for published OASIS model
    fpr_IND, tpr_IND, threshold = roc_curve(y_test, baseline_proba)
    roc_auc_IND = auc(fpr_IND, tpr_IND)

    # Plot ROC curves
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label='AUC_OASIS = %0.3f' % roc_auc)
    plt.plot(fpr_IND, tpr_IND, 'g', label='AUC_IND = %0.3f' % roc_auc_IND)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
# make predictions for test data
y_pred_xgb = xgb_clf.predict(X_test)
predictions = [round(value) for value in y_pred_xgb]

# evaluate predictions
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# In[38]:
# plot ROC
from sklearn.metrics import roc_auc_score

# The legend value is labelled "auc", so it is computed from the
# positive-class probabilities with roc_auc_score. It used to be
# accuracy_score on the hard predictions, which reported accuracy as AUC.
y_prob_gb = gb_clf.predict_proba(X_test)
y_score_gb = y_prob_gb[:, 1]
fpr_gb, tpr_gb, threshold_gb = roc_curve(y_test, y_score_gb)
auc = roc_auc_score(y_test, y_score_gb)
plt.plot(fpr_gb, tpr_gb, label='Gradient Boosting Classifier,auc = %0.2f' % auc)

y_prob_xgb = xgb_clf.predict_proba(X_test)
y_score_xgb = y_prob_xgb[:, 1]
fpr_xgb, tpr_xgb, threshold_xgb = roc_curve(y_test, y_score_xgb)
auc = roc_auc_score(y_test, y_score_xgb)
plt.plot(fpr_xgb, tpr_xgb, label='XGBoosting Classifier,auc = %0.2f' % auc)

# ROC curve plotting: diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
def GB(x_train,y_train,x_test):
    """Fit a gradient boosting classifier and return test class probabilities.

    :param x_train: training features.
    :param y_train: training labels.
    :param x_test: features to score.
    :return: the ``predict_proba`` output for ``x_test``.
    """
    booster = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
    )
    booster.fit(x_train, y_train)
    probabilities = booster.predict_proba(x_test)
    return probabilities
clf.fit(X_bags, y)
score = cross_val_score(estimator=clf, X=X_bags, y=y, cv=folds, scoring='roc_auc').mean()
print('Logistic Regression with bag-of-words, ROC-AUC Score: {0:.6f}'.format(score), '\n')

# What is the minimum\maximum value of the forecast on
# the test sample came from the best of the algorithms?
# NOTE(review): the extent of this `if` body was reconstructed from
# whitespace-collapsed source - confirm against the original file.
if not Bags:
    clf.fit(X_bags, y)

# The raw table keeps the hero columns for the bag-of-words. The numeric
# table is derived from it, so the CSV is read only once.
features_test_raw = read_csv('features_test.csv', index_col='match_id')
features_test = features_test_raw.drop(heroes+lt+st, axis=1)
features_test.fillna(0, inplace=True)
# `.ix` was removed from pandas; the full-frame slice is passed directly.
X_test_no_bag = scale(features_test)

# Bag-of-words for the heroes of the test data:
# +1 in the column of every Radiant hero, -1 for every Dire hero.
X_pick = np.zeros((len(features_test_raw), N_words))
row_positions = np.arange(len(features_test_raw))
for p in range(5):
    radiant_heroes = features_test_raw['r%d_hero' % (p + 1)].values
    dire_heroes = features_test_raw['d%d_hero' % (p + 1)].values
    # Hero ids are 1-based, columns are 0-based.
    X_pick[row_positions, radiant_heroes - 1] = 1
    X_pick[row_positions, dire_heroes - 1] = -1
X_test = np.concatenate([X_test_no_bag, X_pick], axis=1)

# Min/max values:
proba = clf.predict_proba(X_test)[:, 1]
pmax, pmin = np.amax(proba), np.amin(proba)
print('Max proba: {0:.6f},\nMin proba: {1:.6f}'.format(pmax, pmin))
print 'attributes with gaps \n{}'.format( train_X.count()[lambda x: x < len(train_X)]) train_X = train_X.fillna(0).as_matrix() cv = KFold(len(train_y), n_folds=5, shuffle=True, random_state=241) for estimators in range(10, 31, 10): clf = GradientBoostingClassifier(n_estimators=estimators, random_state=241) start_time = datetime.datetime.now() auc_score = [] for traincv, testcv in cv: clf.fit(train_X[traincv], train_y[traincv]) pred = clf.predict_proba(train_X[testcv])[:, 1] auc_score.append(metrics.roc_auc_score(train_y[testcv], pred)) elapsed_time = datetime.datetime.now() - start_time print 'estimators: {0} , auc score: {1:.2f}, time elapsed: {2}'.format( estimators, np.mean(auc_score), elapsed_time) # Part 2 logistic regression print "Logistic regression" heroes_columns = [ 'r1_hero', 'r2_hero', 'r3_hero', 'r4_hero', 'r5_hero', 'd1_hero', 'd2_hero', 'd3_hero', 'd4_hero', 'd5_hero' ]
class MembershipInferenceBlackBox(MembershipInferenceAttack):
    """
    Implementation of a learned black-box membership inference attack.

    This implementation can use as input to the learning process probabilities/logits or losses,
    depending on the type of model and provided configuration.
    """

    attack_params = MembershipInferenceAttack.attack_params + [
        "input_type",
        "attack_model_type",
        "attack_model",
    ]
    _estimator_requirements = (BaseEstimator, (ClassifierMixin, RegressorMixin))

    def __init__(
        self,
        estimator: Union["CLASSIFIER_TYPE", "REGRESSOR_TYPE"],
        input_type: str = "prediction",
        attack_model_type: str = "nn",
        attack_model: Optional[Any] = None,
    ):
        """
        Create a MembershipInferenceBlackBox attack instance.

        :param estimator: Target estimator.
        :param attack_model_type: the type of default attack model to train, optional. Should be one of `nn` (for
                                  neural network, default), `rf` (for random forest) or `gb` (gradient boosting). If
                                  `attack_model` is supplied, this option will be ignored.
        :param input_type: the type of input to train the attack on. Can be one of: 'prediction' or 'loss'. Default is
                           `prediction`. Predictions can be either probabilities or logits, depending on the return type
                           of the model. If the model is a regressor, only `loss` can be used.
        :param attack_model: The attack model to train, optional. If none is provided, a default model will be created.
        :raises ValueError: If `input_type` or `attack_model_type` is not one of the supported values.
        :raises TypeError: If a supplied `attack_model` is not a classifier.
        """
        super().__init__(estimator=estimator)
        self.input_type = input_type
        self.attack_model_type = attack_model_type
        self.attack_model = attack_model

        self._regressor_model = RegressorMixin in type(self.estimator).__mro__

        self._check_params()

        if self.attack_model:
            self.default_model = False
            self.attack_model_type = "None"
        else:
            self.default_model = True
            if self.attack_model_type == "nn":
                import torch  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]
                from torch import nn  # lgtm [py/repeated-import]

                class MembershipInferenceAttackModel(nn.Module):
                    """
                    Implementation of a pytorch model for learning a membership inference attack.

                    The features used are probabilities/logits or losses for the attack training data along with
                    its true labels.
                    """

                    def __init__(self, num_classes, num_features=None):

                        self.num_classes = num_classes
                        if num_features:
                            self.num_features = num_features
                        else:
                            self.num_features = num_classes

                        super().__init__()

                        self.features = nn.Sequential(
                            nn.Linear(self.num_features, 512),
                            nn.ReLU(),
                            nn.Linear(512, 100),
                            nn.ReLU(),
                            nn.Linear(100, 64),
                            nn.ReLU(),
                        )

                        self.labels = nn.Sequential(
                            nn.Linear(self.num_classes, 256),
                            nn.ReLU(),
                            nn.Linear(256, 64),
                            nn.ReLU(),
                        )

                        self.combine = nn.Sequential(
                            nn.Linear(64 * 2, 1),
                        )

                        self.output = nn.Sigmoid()

                    def forward(self, x_1, label):
                        """Forward the model."""
                        out_x1 = self.features(x_1)
                        out_l = self.labels(label)
                        is_member = self.combine(torch.cat((out_x1, out_l), 1))
                        return self.output(is_member)

                if self.input_type == "prediction":
                    num_classes = estimator.nb_classes  # type: ignore
                    self.attack_model = MembershipInferenceAttackModel(num_classes)
                else:
                    # With losses as input there is a single feature per record.
                    if self._regressor_model:
                        self.attack_model = MembershipInferenceAttackModel(1, num_features=1)
                    else:
                        num_classes = estimator.nb_classes  # type: ignore
                        self.attack_model = MembershipInferenceAttackModel(num_classes, num_features=1)

                self.epochs = 100
                self.batch_size = 100
                self.learning_rate = 0.0001
            elif self.attack_model_type == "rf":
                self.attack_model = RandomForestClassifier()
            elif self.attack_model_type == "gb":
                self.attack_model = GradientBoostingClassifier()

    def _label_width(self, labels: np.ndarray) -> int:
        """
        Return the number of classes to use when one-hot encoding `labels`.

        The class count of the target estimator is preferred, so every batch is encoded with the same width
        even when it does not contain every class. If the estimator does not expose a usable `nb_classes`,
        the number of distinct values in `labels` is used.

        :param labels: Labels about to be one-hot encoded.
        :return: Number of classes.
        """
        nb_classes = getattr(self.estimator, "nb_classes", None)
        if isinstance(nb_classes, (int, np.integer)) and nb_classes > 0:
            return int(nb_classes)
        return len(np.unique(labels))

    def fit(  # pylint: disable=W0613
        self,
        x: np.ndarray,
        y: np.ndarray,
        test_x: np.ndarray,
        test_y: np.ndarray,
        pred: Optional[np.ndarray] = None,
        test_pred: Optional[np.ndarray] = None,
        **kwargs
    ):
        """
        Train the attack model.

        :param x: Records that were used in training the target estimator.
        :param y: True labels for `x`.
        :param test_x: Records that were not used in training the target estimator.
        :param test_y: True labels for `test_x`.
        :param pred: Estimator predictions for the records, if not supplied will be generated by calling the estimators'
                     `predict` function. Only relevant for input_type='prediction'.
        :param test_pred: Estimator predictions for the test records, if not supplied will be generated by calling the
                          estimators' `predict` function. Only relevant for input_type='prediction'.
        :raises ValueError: If the shapes of the inputs are inconsistent or `input_type` is not supported.
        """
        if self.estimator.input_shape is not None:
            if self.estimator.input_shape[0] != x.shape[1]:  # pragma: no cover
                raise ValueError("Shape of x does not match input_shape of estimator")
            if self.estimator.input_shape[0] != test_x.shape[1]:  # pragma: no cover
                raise ValueError("Shape of test_x does not match input_shape of estimator")

        if not self._regressor_model:
            # Members and non-members must be encoded with the same width to be concatenated below.
            y = check_and_transform_label_format(y, self._label_width(y), return_one_hot=True)
            test_y = check_and_transform_label_format(test_y, self._label_width(test_y), return_one_hot=True)

        if y.shape[0] != x.shape[0]:  # pragma: no cover
            raise ValueError("Number of rows in x and y do not match")
        if test_y.shape[0] != test_x.shape[0]:  # pragma: no cover
            raise ValueError("Number of rows in test_x and test_y do not match")

        # Create attack dataset
        # uses final probabilities/logits
        if self.input_type == "prediction":
            # members
            if pred is None:
                features = self.estimator.predict(x).astype(np.float32)
            else:
                features = pred.astype(np.float32)
            # non-members
            if test_pred is None:
                test_features = self.estimator.predict(test_x).astype(np.float32)
            else:
                test_features = test_pred.astype(np.float32)
        # only for models with loss
        elif self.input_type == "loss":
            # members
            features = self.estimator.compute_loss(x, y).astype(np.float32).reshape(-1, 1)
            # non-members
            test_features = self.estimator.compute_loss(test_x, test_y).astype(np.float32).reshape(-1, 1)
        else:  # pragma: no cover
            raise ValueError("Illegal value for parameter `input_type`.")

        # members
        labels = np.ones(x.shape[0])
        # non-members
        test_labels = np.zeros(test_x.shape[0])

        x_1 = np.concatenate((features, test_features))
        x_2 = np.concatenate((y, test_y))
        y_new = np.concatenate((labels, test_labels))

        if self._regressor_model:
            x_2 = x_2.astype(np.float32).reshape(-1, 1)

        if self.default_model and self.attack_model_type == "nn":
            import torch  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]
            from torch import nn  # lgtm [py/repeated-import]
            from torch import optim  # lgtm [py/repeated-import]
            from torch.utils.data import DataLoader  # lgtm [py/repeated-import]
            from art.utils import to_cuda

            loss_fn = nn.BCELoss()
            optimizer = optim.Adam(self.attack_model.parameters(), lr=self.learning_rate)  # type: ignore

            attack_train_set = self._get_attack_dataset(f_1=x_1, f_2=x_2, label=y_new)
            train_loader = DataLoader(attack_train_set, batch_size=self.batch_size, shuffle=True, num_workers=0)

            self.attack_model = to_cuda(self.attack_model)  # type: ignore
            self.attack_model.train()  # type: ignore

            for _ in range(self.epochs):
                for (input1, input2, targets) in train_loader:
                    input1, input2, targets = to_cuda(input1), to_cuda(input2), to_cuda(targets)
                    # Both inputs are wrapped; the wrapped first input used to be discarded.
                    input1, input2 = torch.autograd.Variable(input1), torch.autograd.Variable(input2)
                    targets = torch.autograd.Variable(targets)

                    optimizer.zero_grad()
                    outputs = self.attack_model(input1, input2)  # type: ignore
                    loss = loss_fn(outputs, targets.unsqueeze(1))  # lgtm [py/call-to-non-callable]

                    loss.backward()
                    optimizer.step()
        else:
            # Membership labels are binary; convert them to plain class indices.
            y_ready = check_and_transform_label_format(y_new, len(np.unique(y_new)), return_one_hot=False)
            self.attack_model.fit(np.c_[x_1, x_2], y_ready)  # type: ignore

    def infer(self, x: np.ndarray, y: Optional[np.ndarray] = None, **kwargs) -> np.ndarray:
        """
        Infer membership in the training set of the target estimator.

        :param x: Input records to attack.
        :param y: True labels for `x`.
        :param probabilities: a boolean indicating whether to return the predicted probabilities per class, or just
                              the predicted class
        :return: An array holding the inferred membership status, 1 indicates a member and 0 indicates non-member,
                 or class probabilities.
        :raises ValueError: If `y` is missing, the shapes of `x` and `y` are inconsistent, or `input_type` is not
                            supported.
        """
        if y is None:  # pragma: no cover
            raise ValueError("MembershipInferenceBlackBox requires true labels `y`.")

        if self.estimator.input_shape is not None:  # pragma: no cover
            if self.estimator.input_shape[0] != x.shape[1]:
                raise ValueError("Shape of x does not match input_shape of estimator")

        probabilities = kwargs.get("probabilities", False)

        if not self._regressor_model:
            # Encode with the estimator's class count so the width matches what the attack model was trained on,
            # even for batches that contain only some of the classes.
            y = check_and_transform_label_format(y, self._label_width(y), return_one_hot=True)

        if y.shape[0] != x.shape[0]:  # pragma: no cover
            raise ValueError("Number of rows in x and y do not match")

        if self.input_type == "prediction":
            features = self.estimator.predict(x).astype(np.float32)
        elif self.input_type == "loss":
            features = self.estimator.compute_loss(x, y).astype(np.float32).reshape(-1, 1)
        else:  # pragma: no cover
            raise ValueError("Illegal value for parameter `input_type`.")

        if self._regressor_model:
            y = y.astype(np.float32).reshape(-1, 1)

        if self.default_model and self.attack_model_type == "nn":
            import torch  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]
            from torch.utils.data import DataLoader  # lgtm [py/repeated-import]
            from art.utils import to_cuda, from_cuda

            self.attack_model.eval()  # type: ignore
            inferred: Optional[np.ndarray] = None
            test_set = self._get_attack_dataset(f_1=features, f_2=y)
            test_loader = DataLoader(test_set, batch_size=self.batch_size, shuffle=False, num_workers=0)
            for input1, input2, _ in test_loader:
                input1, input2 = to_cuda(input1), to_cuda(input2)
                outputs = self.attack_model(input1, input2)  # type: ignore
                if not probabilities:
                    predicted = torch.round(outputs)
                else:
                    predicted = outputs
                predicted = from_cuda(predicted)

                if inferred is None:
                    inferred = predicted.detach().numpy()
                else:
                    inferred = np.vstack((inferred, predicted.detach().numpy()))

            if inferred is not None:
                if not probabilities:
                    inferred_return = np.round(inferred)
                else:
                    inferred_return = inferred
            else:  # pragma: no cover
                raise ValueError("No data available.")
        elif not self.default_model:
            # assumes the predict method of the supplied model returns probabilities
            pred = self.attack_model.predict(np.c_[features, y])  # type: ignore
            if probabilities:
                inferred_return = pred
            else:
                inferred_return = np.round(pred)
        else:
            pred = self.attack_model.predict_proba(np.c_[features, y])  # type: ignore
            # Column 1 is kept as a 2-D column: the probability of "member".
            if probabilities:
                inferred_return = pred[:, [1]]
            else:
                inferred_return = np.round(pred[:, [1]])

        return inferred_return

    def _get_attack_dataset(self, f_1, f_2, label=None):
        """
        Wrap attack features, true labels and membership labels in a pytorch dataset.

        :param f_1: Attack features (predictions or losses of the target estimator).
        :param f_2: True labels of the records.
        :param label: Membership labels; when omitted the dataset yields zeros in their place.
        :return: A dataset yielding `(f_1[i], f_2[i], label[i])` tuples.
        """
        from torch.utils.data.dataset import Dataset

        class AttackDataset(Dataset):
            """
            Implementation of a pytorch dataset for membership inference attack.

            The features are probabilities/logits or losses for the attack training data (`x_1`) along with
            its true labels (`x_2`). The labels (`y`) are a boolean representing whether this is a member.
            """

            def __init__(self, x_1, x_2, y=None):
                import torch  # lgtm [py/repeated-import] lgtm [py/import-and-import-from]

                self.x_1 = torch.from_numpy(x_1.astype(np.float64)).type(torch.FloatTensor)
                # float32 instead of int32: one-hot labels are unaffected, while regression targets keep their
                # fractional part instead of being truncated.
                self.x_2 = torch.from_numpy(x_2.astype(np.float32)).type(torch.FloatTensor)

                if y is not None:
                    self.y = torch.from_numpy(y.astype(np.int8)).type(torch.FloatTensor)
                else:
                    self.y = torch.zeros(x_1.shape[0])

            def __len__(self):
                return len(self.x_1)

            def __getitem__(self, idx):
                if idx >= len(self.x_1):  # pragma: no cover
                    raise IndexError("Invalid Index")

                return self.x_1[idx], self.x_2[idx], self.y[idx]

        return AttackDataset(x_1=f_1, x_2=f_2, y=label)

    def _check_params(self) -> None:
        """
        Validate the constructor arguments.

        :raises ValueError: If `input_type` or `attack_model_type` has an unsupported value, or if `input_type` is not
                            `loss` for a regressor.
        :raises TypeError: If a supplied attack model is not a classifier.
        """
        if self.input_type not in ["prediction", "loss"]:
            raise ValueError("Illegal value for parameter `input_type`.")

        if self._regressor_model:
            if self.input_type != "loss":
                raise ValueError("Illegal value for parameter `input_type` when estimator is a regressor.")

        if self.attack_model_type not in ["nn", "rf", "gb"]:
            raise ValueError("Illegal value for parameter `attack_model_type`.")

        if self.attack_model:
            if ClassifierMixin not in type(self.attack_model).__mro__:
                raise TypeError("Attack model must be of type Classifier.")
# ROC data for the previously fitted `model`. The AUC is computed from the same positive-class
# scores as the curve; feeding hard `predict()` labels to `roc_auc_score` would describe a
# single operating point, not the plotted curve.
model_scores = model.predict_proba(X_test)[:, 1]
model_roc_auc = roc_auc_score(y_test, model_scores)
fprB_1, tprB_1, thresholdsB_1 = roc_curve(y_test, model_scores)

# Modeling Gradient Boosting Algorithm
GBbaseline = GradientBoostingClassifier()
GBbaseline.fit(X_train, y_train)
y_pred_GB = GBbaseline.predict(X_test)
predictions_GB = [round(value) for value in y_pred_GB]
accuracy_GB = accuracy_score(y_test, predictions_GB)
print(accuracy_GB)

### ROC Curve for Gradient Boosting Algorithm
# Score once and reuse for both the AUC and the curve (see the note on `model_scores`).
gb_scores = GBbaseline.predict_proba(X_test)[:, 1]
GBbaseline_roc_auc = roc_auc_score(y_test, gb_scores)
fpr1_1, tpr1_1, thresholds1_1 = roc_curve(y_test, gb_scores)

### Logistic Regression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_log = logreg.predict(X_test)
from sklearn.metrics import confusion_matrix
# NOTE: this rebinds the name `confusion_matrix` from the imported function to the resulting
# matrix; the name is kept because later code may read the matrix through it.
confusion_matrix = confusion_matrix(y_test, y_pred_log)
print(confusion_matrix)
accuracy_LOG = accuracy_score(y_test, y_pred_log)
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred_log))
#clf4 = svm.SVC(kernel='poly', probability=True, C=1.0)
#clf4.fit(X_train, y_train)

#Broken
#from sklearn.linear_model import LogisticRegression
#rand = RandomTreesEmbedding(n_jobs=-1, n_estimators=1000, min_samples_split=1)
#clf3 = rt_lm = LogisticRegression()
#pipeline = make_pipeline(rand, clf3)
#pipeline.fit(X_train, y_train)

#clf5 = svm.SVC(kernel='rbf', probability=True, C=1.0)
#clf5.fit(X_train, y_train)
#.560

# For each classifier in turn: take column 1 of predict_proba on X_test, store it in the
# 'WnvPresent' column of `sample`, and write `sample` to that classifier's CSV file.
# NOTE(review): within this fragment clf3 and clf4 appear only in commented-out code above;
# presumably they are defined earlier in the file - verify.
predictions1 = sample['WnvPresent'] = clf1.predict_proba(X_test)[:, 1]
sample.to_csv('Visualization1.csv', index=False)

predictions2 = sample['WnvPresent'] = clf2.predict_proba(X_test)[:, 1]
sample.to_csv('Visualization2.csv', index=False)

predictions3 = sample['WnvPresent'] = clf3.predict_proba(X_test)[:, 1]
sample.to_csv('Visualization3.csv', index=False)

predictions4 = sample['WnvPresent'] = clf4.predict_proba(X_test)[:, 1]
sample.to_csv('Visualization4.csv', index=False)
# Testing stage of the stacking experiment.
# NOTE(review): the original nesting of this fragment (e.g. inside a cross-validation loop)
# is not recoverable from this view; it is laid out flat here.
rf_data_answer = []

# For every first-level classifier, keep the predicted probability of class 1 on its own
# testing data; these columns become the input of the second-level classifier.
# The MRSI modality (crf_mrsi / mrsi_testing_data) is deliberately left out, as in the
# original, where its three lines were commented out.
for _level1_clf, _level1_data in ((crf_t2w, t2w_testing_data),
                                  (crf_adc, adc_testing_data),
                                  (crf_dce, dce_testing_data)):
    pred_proba = _level1_clf.predict_proba(_level1_data)
    # Column index of the positive class (label 1) in this classifier's output.
    pos_class_arg = np.ravel(np.argwhere(_level1_clf.classes_ == 1))[0]
    rf_data_answer.append(pred_proba[:, pos_class_arg])

# Stack the per-classifier probability vectors as columns and feed them to the
# second-level classifier.
rf_data_answer = np.vstack(rf_data_answer).T
pred_prob = cgb.predict_proba(rf_data_answer)
result_cv.append([pred_prob, cgb.classes_])

# Save the information
path_store = '/data/prostate/results/mp-mri-prostate/exp-5/stacking'
if not os.path.exists(path_store):
    os.makedirs(path_store)
joblib.dump(result_cv, os.path.join(path_store, 'results.pkl'))
# Single-argument print(...) works on Python 2 and 3 alike; the text is identical to what the
# Python 2 statement `print 'Output : ', outfile` emitted (note the two spaces).
print('Output :  %s' % (outfile,))

# evaluate training results
# NOTE(review): indentation was lost in this fragment; all statements below are assumed to
# belong to the `if args.evaluate:` branch - confirm against the original file.
if args.evaluate:
    util.plot_clf_results_sklearn(bdt, x_train, y_train, w_train,
                                  x_test, y_test, w_test,
                                  figname=args.outdir + "bdtoutput.png",
                                  verbose=(not args.quiet))
    util.print_variables_rank(bdt, var,
                              outname=args.outdir + 'ranks.txt',
                              verbose=(not args.quiet))

    #y_pred = bdt.decision_function(x_test)#.ravel()
    # Column 1 of predict_proba for the test and train samples.
    y_pred_test = bdt.predict_proba(x_test)[:, 1]
    y_pred_train = bdt.predict_proba(x_train)[:, 1]

    #util.plot_roc((y_test, y_pred, w_test), figname=args.outdir+'roc.png',
    #              verbose=(not args.quiet))
    # One (truth, score, weight, label) tuple per curve to draw.
    datalist = [(y_train, y_pred_train, w_train, 'train'),
                (y_test, y_pred_test, w_test, 'test')]
    util.plot_rocs(datalist, figname=args.outdir + 'roc.png',
                   verbose=(not args.quiet), title='')
# Merge the signal and background frames, then hold out 33% of the rows for testing.
data = pandas.concat([sig, bkg])
train, test = train_test_split(data, test_size=0.33, random_state=42)

# Gradient-boosted classifier; every leaf must hold at least 1% of the training rows and
# all features are considered at each split.
clf = GradientBoostingClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    subsample=0.8,
    random_state=13,
    max_features=len(features),
    verbose=1,
    min_samples_leaf=int(0.01 * len(train)),
    max_depth=5,
)
clf.fit(train[features], train.target)
joblib.dump(clf, 'classifier.pkl', compress=True)

# Column 1 of predict_proba on the held-out rows; `bdt` is an independent copy.
pred = clf.predict_proba(test[features])[:, 1]
bdt = pred.copy()

import itertools

# Diagonal reference line: 10^-8 .. 10^-1, each scaled by 1, 2, 4 and 8, plus the end point 1.
xy = [i * j for i in [10.**i for i in range(-8, 0)] for j in [1, 2, 4, 8]] + [1]
plt.plot(xy, xy, color='grey', linestyle='--')

plt.xlim([10**-5, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

#draw baseline point
x_train, x_test, y_train, y_test = train_test_split(df_cp, train_Y, test_size=0.25, random_state=4) ########## model start from sklearn.ensemble import GradientBoostingClassifier gdbt = GradientBoostingClassifier(learning_rate=0.01) # 訓練模型 gdbt.fit(x_train, y_train) # 預測測試集 y_pred = gdbt.predict(x_test) y_pred_proba = gdbt.predict_proba(x_test)[:, 1] ########## model end ########## 糢型憑估 start from sklearn import datasets, metrics from sklearn.metrics import confusion_matrix from sklearn.metrics import mean_squared_error, r2_score, accuracy_score check_view = pd.DataFrame({'pred_poi': y_pred_proba, 'poi': y_test}) check_view = check_view.sort_values(by=['pred_poi']) acc = accuracy_score(y_test, y_pred) print("Accuracy: ", acc) var_confusion_matrix = confusion_matrix(y_test, y_pred,
subsample=0.6, max_depth=4, learning_rate=0.04, max_features=50, ) model_rf.fit(X, y) model_extraTrees.fit(X, y) rbf_model.fit(X, y) linear_model.fit(X, y) boost_model.fit(X, y) #get predictions from the machines for the data! extrees_prob = model_extraTrees.predict_proba(org1) rftrees_prob = model_rf.predict_proba(org1) grb_prob = boost_model.predict_proba(org1) rbf_prob = rbf_model.predict_proba(org1) preds_linear = linear_model.predict_proba(org1) grb_prob = grb_prob[:, 1] # Get only the ones'column probabilities rbf_prob = rbf_prob[:, 1] preds_linear = preds_linear[:, 1] extrees_prob = extrees_prob[:, 1] rftrees_prob = rftrees_prob[:, 1] threshhold = 0.9 #hold the index locations of the high scoring samples: # indexs_bestPreds_GRB = np.where(grb_prob>threshhold) indexs_bestPreds_RF = get_threshholdLocs(rftrees_prob, 0.75) indexs_bestPreds_GRB = get_threshholdLocs(grb_prob, threshhold) indexs_bestPreds_RBF = get_threshholdLocs(rbf_prob, 0.6)
def train_gb():
    """Fit a gradient-boosting classifier, write a submission and print CV scores.

    Uses the module-level ``train_features``, ``train_labels``, ``test_features``,
    ``ids`` and ``outfile``. Column 1 of ``predict_proba`` on the test features is
    passed to ``save_submission`` under the name ``outfile + "_gb"``; the result of
    ``cross_val_score`` on the training data is then printed.
    """
    gb = GradientBoostingClassifier(n_estimators=100)
    gb.fit(train_features, train_labels)

    probs = gb.predict_proba(test_features)[:, 1]
    save_submission(outfile + "_gb", ids, probs)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("created submission for gb")

    # NOTE(review): newer scikit-learn releases name this scorer "neg_log_loss" (with the
    # sign flipped); the original string is kept - confirm against the installed version.
    print(cross_val_score(gb, train_features, train_labels, scoring="log_loss"))