def train_classifiers(X_data, y_data):
    """Fit seven scikit-learn classifiers on the same training data.

    The figure next to each model name is the score recorded in the
    original source; it is not recomputed here.

    Parameters
    ----------
    X_data : feature matrix, passed unchanged to every ``fit`` call.
    y_data : target labels, passed unchanged to every ``fit`` call.

    Returns
    -------
    tuple
        ``(clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC)``,
        each one already fitted.
    """
    # Linear SVM: 0.908
    clf_LSVM = svm.SVC(kernel='linear')
    clf_LSVM.fit(X_data, y_data)

    # MultinomialNB: 0.875
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)

    # Random Forest: 0.910
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)

    # Extra Trees: 0.915
    # min_samples_split must be >= 2 in current scikit-learn releases.  A
    # node holding a single sample can never be split, so 2 grows the same
    # trees as the previous value of 1 while remaining a valid setting.
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None,
                                   min_samples_split=2, random_state=0)
    clf_ETC.fit(X_data, y_data)

    # AdaBoost: 0.88
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)

    # RBF SVM: 0.895
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)

    # GradientBoosting: 0.88
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)

    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC
def test_plot_partial_dependence_multiclass(pyplot):
    """Check plot_partial_dependence on multi-class input.

    The iris data is fitted twice, once with integer targets and once with
    the string class names, and in both cases the plot for features 0 and 1
    must yield two axes that actually contain data.
    """
    iris = load_iris()
    grid_resolution = 25

    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)
    plot_partial_dependence(clf, iris.data, [0, 1], target=0,
                            grid_resolution=grid_resolution)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 2
    # has_data is a method: without the call the bound method object is
    # always truthy and the assertion could never fail.
    assert all(ax.has_data() for ax in axs)

    # now with symbol labels
    target = iris.target_names[iris.target]
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, target)
    plot_partial_dependence(clf, iris.data, [0, 1], target='setosa',
                            grid_resolution=grid_resolution)
    fig = pyplot.gcf()
    axs = fig.get_axes()
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)
def test_check_inputs_predict():
    """predict must raise ValueError for inputs of the wrong shape.

    The same three malformed inputs are tried against a fitted classifier
    and then against a fitted regressor.
    """
    def wrong_shaped_inputs():
        # Built fresh on every call, exactly as the inputs are used one by one.
        yield np.array([1.0, 2.0])[:, np.newaxis]
        yield np.array([[]])
        yield np.array([1.0, 2.0, 3.0])[:, np.newaxis]

    classifier = GradientBoostingClassifier(n_estimators=100, random_state=1)
    classifier.fit(X, y)
    for bad_input in wrong_shaped_inputs():
        assert_raises(ValueError, classifier.predict, bad_input)

    regressor = GradientBoostingRegressor(n_estimators=100, random_state=1)
    regressor.fit(X, rng.rand(len(X)))
    for bad_input in wrong_shaped_inputs():
        assert_raises(ValueError, regressor.predict, bad_input)
def ctr_gbdt(model='sklearn-clicklog', from_cache=False,
             train_dataset_length=100000, test_dataset_length=100000):
    """Fit a gradient-boosted classifier and print its train/test log loss.

    The four arguments are forwarded unchanged to ``create_dataset``, which
    returns the train and test file handles/paths consumed by ``clean_data``.
    Nothing is returned; the two losses are printed.
    """
    train_source, test_source = create_dataset(
        model, from_cache, train_dataset_length, test_dataset_length)

    gbdt_settings = dict(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )
    prediction_model = GradientBoostingClassifier(**gbdt_settings)

    x_train, y_train = clean_data(train_source)
    x_test, y_test = clean_data(test_source)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        proba_train = prediction_model.predict_proba(x_train)
        proba_test = prediction_model.predict_proba(x_test)
        loss_train = log_loss(y_train, proba_train)
        loss_test = log_loss(y_test, proba_test)

    print('loss_train: %s' % loss_train)
    print('loss_test: %s' % loss_test)
def test_staged_predict_proba():
    """Staged predictions must end up equal to the final predictions.

    Also checks that iterating ``staged_predict_proba`` on an unfitted
    estimator raises NotFittedError.
    """
    X, y = datasets.make_hastie_10_2(n_samples=1200, random_state=1)
    n_train = 200
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    clf = GradientBoostingClassifier(n_estimators=20)

    def drain_staged_proba(samples):
        # Consuming the generator is what triggers the fitted-check.
        return np.fromiter(clf.staged_predict_proba(samples),
                           dtype=np.float64)

    # unfitted estimator -> NotFittedError
    assert_raises(NotFittedError, drain_staged_proba, X_test)

    clf.fit(X_train, y_train)

    # the last value yielded by staged_predict must equal predict
    for stage_labels in clf.staged_predict(X_test):
        assert_equal(y_test.shape, stage_labels.shape)
    assert_array_equal(clf.predict(X_test), stage_labels)

    # the last value yielded by staged_predict_proba must equal predict_proba
    n_test = y_test.shape[0]
    for stage_proba in clf.staged_predict_proba(X_test):
        assert_equal(n_test, stage_proba.shape[0])
        assert_equal(2, stage_proba.shape[1])
    assert_array_almost_equal(clf.predict_proba(X_test), stage_proba)
def test_gradient_boosting_early_stopping():
    """Early stopping must halt at the recorded number of estimators.

    With ``n_iter_no_change`` set, each (estimator, tol) pair is expected to
    stop at a fixed stage count and still score above 0.7; without it, all
    requested estimators must be built.
    """
    X, y = make_classification(n_samples=1000, random_state=0)
    shared = dict(learning_rate=0.1, max_depth=3, random_state=42)

    gbc = GradientBoostingClassifier(n_estimators=1000, n_iter_no_change=10,
                                     **shared)
    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    **shared)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)

    # (estimator, tol, expected n_estimators_) with early stopping active
    expectations = (
        (gbc, 1e-1, 24),
        (gbr, 1e-1, 13),
        (gbc, 1e-3, 36),
        (gbr, 1e-3, 28),
    )
    for est, tol, expected_stages in expectations:
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, expected_stages)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping every requested stage is fitted.
    gbc = GradientBoostingClassifier(n_estimators=100, **shared)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, **shared)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
def test_gradient_boosting_validation_fraction():
    """validation_fraction and n_iter_no_change must affect n_estimators_.

    Variants of a classifier and a regressor are cloned with a different
    validation fraction (0.3) or a larger patience (20) and compared with
    the baseline after fitting on the same split.
    """
    X, y = make_classification(n_samples=1000, random_state=0)
    baseline = dict(n_estimators=100, n_iter_no_change=10,
                    validation_fraction=0.1, learning_rate=0.1,
                    max_depth=3, random_state=42)

    gbc = GradientBoostingClassifier(**baseline)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(**baseline)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)

    # A different validation fraction must change the stopping point.
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # More patience (n_iter_no_change) must lead to more fitted stages.
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
def test_plot_partial_dependence_input():
    """plot_partial_dependence must raise ValueError on invalid input."""
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)

    # estimator has not been fitted yet
    assert_raises(ValueError, plot_partial_dependence, clf, X, [0])

    clf.fit(X, y)

    # data with zero feature columns
    no_columns = np.array(X)[:, :0]
    assert_raises(ValueError, plot_partial_dependence, clf, no_columns, [0])

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, plot_partial_dependence, {}, X, [0])

    # Remaining cases only differ in the requested features:
    #   [-1]             -> index must be larger than -1
    #   [100]            -> index too large
    #   ['foobar']       -> str feature but no feature_names
    #   [{'foo': 'bar'}] -> not a valid features value
    rejected_features = ([-1], [100], ['foobar'], [{'foo': 'bar'}])
    for features in rejected_features:
        assert_raises(ValueError, plot_partial_dependence, clf, X, features)
def test_plot_partial_dependence_multiclass():
    """Check plot_partial_dependence on multi-class input.

    Covers integer labels, string labels, an unknown label and a missing
    label.  For the two valid cases the returned axes must number two and
    must actually contain data.
    """
    grid_resolution = 25

    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1], label=0,
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    # has_data is a method: without the call the bound method object is
    # always truthy and the assertion could never fail.
    assert all(ax.has_data() for ax in axs)

    # now with symbol labels
    target = iris.target_names[iris.target]
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, target)
    fig, axs = plot_partial_dependence(clf, iris.data, [0, 1],
                                       label='setosa',
                                       grid_resolution=grid_resolution)
    assert len(axs) == 2
    assert all(ax.has_data() for ax in axs)

    # label not in gbrt.classes_
    assert_raises(ValueError, plot_partial_dependence, clf, iris.data,
                  [0, 1], label='foobar', grid_resolution=grid_resolution)

    # label not provided
    assert_raises(ValueError, plot_partial_dependence, clf, iris.data,
                  [0, 1], grid_resolution=grid_resolution)
def test_classification_synthetic():
    """Test GradientBoostingClassifier on the Hastie et al. synthetic data.

    Uses the dataset from ESLII Example 12.7.  For each supported loss a
    plain model must reach a test error below 0.09 and a stochastic model
    (``subsample=0.5``) a test error below 0.08.
    """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)

    X_train, X_test = X[:2000], X[2000:]
    y_train, y_test = y[:2000], y[2000:]

    for loss in ('deviance', 'exponential'):
        # min_samples_split=2 is the smallest value scikit-learn accepts;
        # with max_depth=1 stumps it is equivalent to the former value 1.
        gbrt = GradientBoostingClassifier(n_estimators=100,
                                          min_samples_split=2,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.09, \
            "GB(loss={}) failed with error {}".format(loss, error_rate)

        # The failure message reports ``loss``, so the stochastic model has
        # to be trained with that loss as well; previously it silently used
        # the default loss on every iteration.
        gbrt = GradientBoostingClassifier(n_estimators=200,
                                          min_samples_split=2,
                                          max_depth=1, loss=loss,
                                          learning_rate=1.0, subsample=0.5,
                                          random_state=0)
        gbrt.fit(X_train, y_train)
        error_rate = (1.0 - gbrt.score(X_test, y_test))
        assert error_rate < 0.08, (
            "Stochastic GradientBoostingClassifier(loss={}) "
            "failed with error {}".format(loss, error_rate))
def test_partial_dependecy_input():
    """partial_dependence must raise ValueError on invalid arguments."""
    fitted = GradientBoostingClassifier(n_estimators=10, random_state=1)
    fitted.fit(X, y)

    # grid and X both None, then grid and X both supplied
    assert_raises(ValueError, partial_dependence, fitted, [0],
                  grid=None, X=None)
    assert_raises(ValueError, partial_dependence, fitted, [0],
                  grid=[0, 1], X=X)

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, partial_dependence, {}, [0], X=X)

    # Gradient boosting estimator must be fit
    unfitted = GradientBoostingClassifier()
    assert_raises(ValueError, partial_dependence, unfitted, [0], X=X)

    # target feature indices -1 and 100 are rejected
    for feature_index in (-1, 100):
        assert_raises(ValueError, partial_dependence, fitted,
                      [feature_index], X=X)

    # wrong ndim for grid
    three_dim_grid = np.random.rand(10, 2, 1)
    assert_raises(ValueError, partial_dependence, fitted, [0],
                  grid=three_dim_grid)
def train_GBDT(self):
    """Fit a 1000-estimator GradientBoostingClassifier and return it.

    Training data comes from ``self.trainset.values`` and the targets from
    ``self.trainlabel.values``.
    """
    samples, target = self.trainset.values, self.trainlabel.values
    model = GradientBoostingClassifier(n_estimators=1000)
    model.fit(samples, target)
    return model
def model_train_ensemble(X1, Y1, Save=False, modelname=None):
    """Train a gradient-boosting model on down-sampled data.

    Parameters
    ----------
    X1, Y1 : features and labels; both are first passed through
        ``DowmSample(X1, Y1, 9)`` and ``Y1.ravel()`` is used as the target.
    Save : when truthy, the fitted model is pickled to ``modelname``.
    modelname : destination path for the pickle; required when ``Save`` is
        set.

    Returns
    -------
    The fitted GradientBoostingClassifier.
    """
    X1, Y1 = DowmSample(X1, Y1, 9)

    model = GradientBoostingClassifier(n_estimators=100, max_leaf_nodes=5,
                                       subsample=0.7, learning_rate=0.1,
                                       random_state=1)
    model.fit(X1, Y1.ravel())

    # Save the model.
    if Save:
        # Pickle data is binary: text mode corrupts it on some platforms and
        # fails outright on Python 3.  The context manager also guarantees
        # the handle is closed if dump() raises.
        with open(modelname, 'wb') as model_file:
            pickle.dump(model, model_file)

    print('\n -------------- Training is over ----------------------')
    return model
def main():
    """Train a gradient-boosting model, cross-validate it and write outputs.

    Three local flags choose the steps: 10-fold validation through
    ``classifier.Classifier``, writing a submission CSV, and printing a
    feature ranking.  All paths are relative to the working directory.
    """
    make_sub = True
    feature_importance = False
    cv_fold = True

    df = pd.read_csv('../data/cprobTrain15NA.csv')
    # Read train.csv once; positional columns 1..8 are the base features and
    # ACTION is the target.
    train = pd.read_csv('../data/train.csv')
    base = train.iloc[:, 1:9]
    y = np.array(train.ACTION)
    X = np.hstack((np.array(base), np.array(df)))
    # Column names in the same order as the columns of X, so that importance
    # indices can be mapped back to names.
    feature_names = list(base.columns) + list(df.columns)

    params = {'max_depth': 4, 'subsample': 0.5, 'verbose': 0,
              'random_state': 1337, 'min_samples_split': 10,
              'min_samples_leaf': 10, 'max_features': 10,
              'n_estimators': 350, 'learning_rate': 0.05}
    clf = GradientBoostingClassifier(**params)
    prefix = 'lib/gbm350d4m10c15'

    if cv_fold:
        c = classifier.Classifier(X, y)
        c.validate(clf, nFolds=10, out=prefix + 'Train.csv')

    if make_sub:
        Xt = np.array(pd.read_csv('../data/test.csv', usecols=range(1, 9)))
        Xt = np.hstack(
            (Xt, np.array(pd.read_csv('../data/cprobTest15NA.csv'))))
        clf.fit(X, y)
        y_ = clf.predict_proba(Xt)[:, 1]
        out = pd.read_csv('subs/nbBaseTest.csv')
        out.ACTION = y_
        out.to_csv(prefix + 'Test.csv', index=False)

    if feature_importance:
        print("Feature ranking:")
        importances = clf.feature_importances_
        indices = np.argsort(importances)[::-1]
        np.savetxt('indices.txt', indices, delimiter=',')
        # indices refer to columns of X (base features followed by df), so
        # they must be looked up in feature_names; indexing df.columns with
        # them named the wrong feature or ran off the end.
        for rank, idx in enumerate(indices, 1):
            print("%d. feature (%s,%f)"
                  % (rank, feature_names[idx], importances[idx]))
def PlotFeaturesImportance(X, y, featureNames, dataName):
    '''
    Plot the relative contribution/importance of the features.

    A 40-estimator GradientBoostingClassifier is fitted on (X, y); its
    feature importances are rescaled so the largest equals 100 and drawn as
    a horizontal bar chart, which is saved to "<dataName>TopFeatures.png"
    and then shown.

    Best to reduce to top X features first - for interpretability.
    Code example from:
    http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/

    featureNames must be indexable by the sorted importance order, i.e. hold
    one name per column of X.
    '''
    gbc = GradientBoostingClassifier(n_estimators=40)
    gbc.fit(X, y)

    # Get feature importance from the classifier and normalise to 0..100.
    feature_importance = gbc.feature_importances_
    feature_importance = 100 * (feature_importance / feature_importance.max())
    sorted_idx = numpy.argsort(feature_importance)
    pos = numpy.arange(sorted_idx.shape[0]) + 4.5

    plt.figure(figsize=(14, 9), dpi=250)
    plt.barh(pos, feature_importance[sorted_idx], align='center',
             color='#7A68A6')
    plt.yticks(pos, numpy.asanyarray(featureNames)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('%s: Top Features' % (dataName))
    # 'off' is a non-empty string and therefore truthy; pass the boolean so
    # the grid is really disabled on every matplotlib version.
    plt.grid(False)
    # Save before showing: a blocking show() can leave nothing to save once
    # the window has been closed.
    plt.savefig(str(dataName) + 'TopFeatures.png', dpi=200)
    plt.ion()
    plt.show()
def train():
    """Train RF and GBDT models and map each test id to a predicted position.

    Features and targets come from ``merge_feature(feature_str)``.  Only
    training rows whose target is below 32 are used.  The class
    probabilities of both models are summed and the highest-scoring column
    index is translated with ``dl.get_num_position``.

    Returns
    -------
    dict
        ``{test_id: position_name}`` for every entry of the test id list.
    """
    posi_result = {}
    (train_feature, test_feature, _train_id_list,
     test_id_list, train_tar_list) = merge_feature(feature_str)

    # Keep only samples with target < 32.  The mask is applied to the
    # features as well as the labels so that both stay aligned; filtering
    # the labels alone breaks fit() as soon as one row is dropped.
    target_list = np.array(train_tar_list)
    keep = target_list < 32
    target_list = target_list[keep]
    # assumes train_feature is a pandas DataFrame (it exposes .columns), so a
    # boolean array selects rows - TODO confirm against merge_feature
    train_rows = train_feature[keep]
    c_feature = train_feature.columns[:]

    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_rows[c_feature], target_list)
    rf_prob = clf1.predict_proba(test_feature)

    gbdt1 = GradientBoostingClassifier(n_estimators=150,
                                       min_samples_split=17)
    gbdt1.fit(train_rows[c_feature], target_list)
    gbdt_prob = gbdt1.predict_proba(test_feature)

    # Both models are fitted on the same labels, so their probability
    # columns line up and can be added.
    all_prob = rf_prob + gbdt_prob
    print(all_prob.shape)

    # First index of the row maximum, same tie-breaking as list.index(max()).
    all_preds = all_prob.argmax(axis=1)
    for test_id, pred in zip(test_id_list, all_preds):
        posi_result[test_id] = dl.get_num_position(int(pred))
    return posi_result
def test_max_feature_auto():
    """Check that max_features_ is resolved correctly for floats and strings.
    """
    X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
    n_features = X.shape[1]
    X_train, y_train = X[:2000], y[:2000]

    # Classifier: 'auto' resolves to sqrt(n_features).
    gbrt = GradientBoostingClassifier(n_estimators=1, max_features='auto')
    gbrt.fit(X_train, y_train)
    assert_equal(gbrt.max_features_, int(np.sqrt(n_features)))

    # Regressor: (max_features setting, expected resolved value).
    regressor_cases = (
        ('auto', n_features),
        (0.3, int(n_features * 0.3)),
        ('sqrt', int(np.sqrt(n_features))),
        ('log2', int(np.log2(n_features))),
    )
    for setting, expected in regressor_cases:
        gbrt = GradientBoostingRegressor(n_estimators=1,
                                         max_features=setting)
        gbrt.fit(X_train, y_train)
        assert_equal(gbrt.max_features_, expected)
def gbc_gp_predict(train_x, train_y, test_x):
    """Two-stage prediction: classifier gate, then ``gbc_gp_predict_part``.

    A GradientBoostingClassifier is trained on engineered features.  Test
    rows whose positive-class probability exceeds 0.55 are handed to
    ``gbc_gp_predict_part``; every other row keeps a prediction of 0.

    Returns an array with one value per row of ``test_x``.
    """
    feature_indexs = getTopFeatures(train_x, train_y)

    def engineer(raw):
        # Same feature recipe for the train and the test matrix.
        return get_data(
            raw,
            feature_indexs[:16],
            features.feature_pair_sub_list,
            features.feature_pair_plus_list,
            features.feature_pair_mul_list,
            features.feature_pair_divide_list[:20],
        )

    train_matrix = engineer(train_x)
    test_matrix = engineer(test_x)

    gate = GradientBoostingClassifier(n_estimators=3000, max_depth=9)
    gate.fit(train_matrix, toLabels(train_y))

    positive_proba = gate.predict_proba(test_matrix)[:, 1]
    selected = np.where(positive_proba > 0.55)[0]

    selected_preds = gbc_gp_predict_part(train_matrix, train_y,
                                         test_matrix[selected])
    gp_preds = np.zeros(len(test_x))
    gp_preds[selected] = selected_preds
    return gp_preds
def ada_boost():
    """Train a gradient-boosting model and write class probabilities to CSV.

    Training data is unpickled from 'traindata.pkl' and test data from
    'testdata.pkl'.  Labels are shifted down by one before fitting.  Despite
    the function name, the active model is a GradientBoostingClassifier.

    Two local switches choose between writing the submission through
    ``kcsv.print_csv`` and running 5-fold cross-validation.
    """
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    # Context managers close the handles even if loading fails.
    with open('traindata.pkl', 'rb') as savefile:
        x_train, y_train, _ = cPickle.load(savefile)
    with open('testdata.pkl', 'rb') as savefile:
        x_test, _, name1 = cPickle.load(savefile)

    x_train = np.asarray(x_train, dtype=np.float32)
    # Shift the labels down by one before fitting.
    y_train = np.asarray(y_train, dtype='int32') - 1

    nest = 190
    lr = .1
    md = 6
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr,
                                     max_depth=md, random_state=0)

    make_submission = True
    run_cross_validation = False

    if make_submission:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_1', 'Class_2', 'Class_3', 'Class_4', 'Class_5',
                 'Class_6', 'Class_7', 'Class_8', 'Class_9']
        kcsv.print_csv(ypred, name1, y_str, indexname='id')
        print((nest, lr, md))

    if run_cross_validation:
        # NOTE(review): a log-loss is normally wrapped with
        # greater_is_better=False - confirm before using these scores for
        # model selection.
        multiclass_log_loss = make_scorer(score_func=logloss_mc,
                                          greater_is_better=True,
                                          needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5,
                                 scoring=multiclass_log_loss)
        print(scores)
        print((nest, lr, md, scores.mean()))
def GradBoost(X_DS, Y_DS, X_train, X_test, y_train, y_test, Cl_Names='None',
              mask='None', Max_Depth=3):
    """Train a gradient-boosting classifier and report its test performance.

    Parameters
    ----------
    X_DS : unused; kept for interface compatibility.
    Y_DS : complete label set; its unique values define the class list
        handed to the confusion-matrix routine ``CM``.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : data used for the confusion matrix and the report.
    Cl_Names : class names for the report.  The default string 'None' (or
        ``None``) means "number the classes 0..k-1".
    mask : unused; kept for interface compatibility.
    Max_Depth : ``max_depth`` of the individual trees.

    Returns
    -------
    tuple
        ``(fitted_model, confusion_matrix)``.
    """
    from sklearn.ensemble import GradientBoostingClassifier as GBC
    from sklearn.metrics import classification_report

    print('Gradient Boosting: Training...')
    Gradient_Boosting_obj = GBC(max_depth=Max_Depth)
    Gradient_Boosting_obj.fit(X_train, y_train)
    Pred_Test = Gradient_Boosting_obj.predict(X_test)
    print('Gradient Boosting: Completed!')

    classes = np.unique(Y_DS)

    # Compare against the sentinel only when a string was passed: comparing
    # an array of names with 'None' is element-wise and has no single truth
    # value.
    no_names = Cl_Names is None or (isinstance(Cl_Names, str)
                                    and Cl_Names == 'None')
    if no_names:
        target_names = np.arange(len(classes)).astype(str).tolist()
    else:
        target_names = Cl_Names

    # Confusion matrix of the test predictions over all known classes.
    Conf_M = CM(y_test, Pred_Test, classes)
    print(classification_report(y_test, Pred_Test,
                                target_names=target_names))
    return Gradient_Boosting_obj, Conf_M
def get_n_fold_validation_score(self, fold=10):
    """Return the mean accuracy of a contiguous n-fold cross-validation.

    Features and labels are read from the module-level ``data`` object.
    The rows are cut into ``fold`` consecutive blocks of ``len // fold``
    rows; each block serves once as test set while all remaining rows
    (including any remainder rows at the end) form the training set.

    Parameters
    ----------
    fold : number of folds; must be between 1 and the number of rows.

    Returns
    -------
    float
        Average accuracy over all folds.

    Raises
    ------
    ValueError
        If ``fold`` is smaller than 1 or larger than the number of rows.
    """
    features = data.get_features()
    lables = data.get_lables()
    length = len(features)
    if fold < 1 or fold > length:
        raise ValueError(
            "fold must be between 1 and the number of rows (%d), got %r"
            % (length, fold))

    # Integer division: the block size is used as a slice bound.
    jump = length // fold
    scores = []
    for k in range(fold):
        start = k * jump
        stop = start + jump

        feature_test = features.iloc[start:stop, :]
        lable_test = lables.iloc[start:stop, :]

        # Everything outside [start, stop) is training data.  The former
        # bounds (index - 1, index + jump + 1, length - 1) silently dropped
        # the rows next to the test block and the very last row.
        feature_train = pd.concat(
            [features.iloc[:start, :], features.iloc[stop:, :]])
        lable_train = pd.concat(
            [lables.iloc[:start, :], lables.iloc[stop:, :]])

        classifier = GradientBoostingClassifier()
        classifier.fit(feature_train, lable_train["lable"].values)
        scores.append(accuracy_score(lable_test["lable"].values,
                                     classifier.predict(feature_test)))
    return sum(scores) / float(len(scores))
def gbdt_train(self, data, task_id, window=DEFAULT_WINDOW):
    """
    Train a gbdt model and dump it to ``MODEL_PATH + task_id + "_model"``.

    :param data: Training dataset.
    :param task_id: The id of the training task.
    :param window: the length of window
    :return: a ``(code, message)`` tuple: ``(TSD_OP_SUCCESS, "")`` on
        success, ``(TSD_LACK_SAMPLE, "")`` when no features could be
        computed, ``(TSD_TRAIN_ERR, <error text>)`` when fitting or dumping
        the model fails.
    """
    features = self.__calculate_features(data, window)
    # The check used to be inverted: it bailed out whenever features WERE
    # available, so no model was ever trained.  The early return is also a
    # (code, message) tuple now, matching the other two exits.
    if not features:
        return TSD_LACK_SAMPLE, ""

    # Each feature entry is a (sample_vector, label) pair.
    X_train = np.array([entry[0] for entry in features])
    y_train = np.array([entry[1] for entry in features])

    try:
        grd = GradientBoostingClassifier(n_estimators=self.n_estimators,
                                         max_depth=self.max_depth,
                                         learning_rate=self.learning_rate)
        grd.fit(X_train, y_train)
        model_name = MODEL_PATH + task_id + "_model"
        joblib.dump(grd, model_name)
    except Exception as ex:
        # Deliberate boundary: any failure is reported through the status
        # code instead of being raised to the caller.
        return TSD_TRAIN_ERR, str(ex)
    return TSD_OP_SUCCESS, ""
class Blender(BaseEstimator, ClassifierMixin):
    """Stack already-trained classifiers behind a gradient-boosting model.

    The class probabilities predicted by every classifier in
    ``trained_clfs`` are placed side by side and used as the input
    features of a GradientBoostingClassifier.
    """

    def __init__(self, trained_clfs):
        # Fitted classifiers; each must provide predict_proba.
        self.clfs = trained_clfs
        self.classifier = GradientBoostingClassifier()

    def fit(self, data, target):
        """Fit the blending classifier on the stacked probabilities.

        Returns ``self`` so that calls can be chained, as is usual for
        scikit-learn estimators.
        """
        probs = self.transform_input(data)
        self.classifier.fit(probs, target)
        return self

    def predict(self, data):
        """Predict labels for ``data`` from the stacked probabilities."""
        predictions = self.transform_input(data)
        return self.classifier.predict(predictions)

    def transform_input(self, data):
        """Return a (samples, n_clfs * n_classes) probability matrix.

        Row i holds the probabilities every classifier predicts for sample
        i; NaN entries are replaced by 0.
        """
        probabilities = [clf.predict_proba(data) for clf in self.clfs]
        # Concatenate along the column axis.  Reshaping the stacked
        # (n_clfs, samples, classes) array straight to
        # (samples, n_clfs * classes) does not move any data and therefore
        # mixed values of different samples into the same row.
        probabilities = np.hstack(probabilities)
        probabilities[np.isnan(probabilities)] = 0
        return probabilities
def main():
    """Fit a 1000-estimator gradient-boosting model and report accuracy.

    Reads the pickled arrays 'train.p' and 'test.p', treating the last
    column as the target and all other columns as features, prints the test
    accuracy and pickles the fitted model to 'gradient_1000.p'.
    """
    print('[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S')))
    # Pickle data is binary, so the files are opened in binary mode; the
    # context managers close them even when loading fails.
    # NOTE: unpickling executes arbitrary code; only load trusted files.
    with open('train.p', 'rb') as training_file:
        train = pickle.load(training_file)
    with open('test.p', 'rb') as testing_file:
        test = pickle.load(testing_file)

    trainX = train[:, :-1]
    trainy = train[:, -1]
    testX = test[:, :-1]
    testy = test[:, -1]

    print('[INFO, time: %s] Fitting %s ...'
          % (time.strftime('%H:%M:%S'),
             'GradientBoostingClassifier(n_estimators=1000)'))
    clf = GradientBoostingClassifier(n_estimators=1000)
    clf.fit(trainX, trainy)

    print('[INFO, time: %s] Making Predictions...'
          % (time.strftime('%H:%M:%S')))
    prediction = clf.predict(testX)
    print('[RESULT, time: %s] accuracy = %f'
          % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction)))

    with open('gradient_1000.p', 'wb') as model_save_file:
        pickle.dump(clf, model_save_file)
    print('All done')
def partial_dependence(df, y):
    '''
    Draw partial dependence plots for the six most important features.

    INPUT:
    df = features
    y = target variable

    OUTPUT:
    None; the plots are drawn through plot_partial_dependence.

    The data is split with oversample_train_test, run through the feature
    engineering pipeline, and a GradientBoostingClassifier is fitted on the
    training part.  The six features with the highest importance are then
    plotted against the transformed test part.
    '''
    X_train, X_test, y_train, _y_test = oversample_train_test(df, y)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    # Only transform the test data: refitting the pipeline on it would let
    # the test set define its own features and could yield columns that do
    # not match what the model was trained on.
    X_test = feature_engineering.transform(X_test.copy())

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)

    # argsort is ascending, so the last six entries are the most important.
    most_imp = np.argsort(gbc.feature_importances_)[-6:]
    names = list(X_test.columns)
    feats = list(most_imp)
    plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                            n_jobs=3, grid_resolution=50)
def predict(fea, df, t, t9):
    """Fit a gradient-boosting model on selected columns and print its AUC.

    Parameters
    ----------
    fea : iterable of base feature names.  For every name ``f`` the columns
        ``f``, ``f + '_x'`` and ``f + '_y'`` are selected when present.
    df : DataFrame holding the feature columns and a ``label`` column.
    t : row selector for the rows used for training and testing.
    t9 : row selector for the additional evaluation rows reported as
        "September AUC".

    Returns
    -------
    tuple
        ``(Un, clf)``: the boolean column mask over ``df.columns`` and the
        fitted classifier.
    """
    # The loop used to read an undefined name ``Fea`` and ignored the
    # ``fea`` argument.  'Blank' stays selectable exactly as before, and
    # 'New_y' is always excluded.
    wanted = ['Blank']
    for f in fea:
        wanted.extend((f, f + '_x', f + '_y'))
    Un = df.columns.isin(wanted) & (df.columns != 'New_y')

    clf = GradientBoostingClassifier()
    y = df[t].label
    # .loc replaces .ix, which has been removed from pandas.
    X = df[t].loc[:, Un]
    # Only 10% of the rows are used for training (test_size=0.9).
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.9, random_state=1)
    clf.fit(X_train, y_train)

    message = 'Testing AUC: \t' + str(
        roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print(message)
    message = 'September AUC: \t' + str(
        roc_auc_score(df[t9].label,
                      clf.predict_proba(df[t9].loc[:, Un])[:, 1]))
    print(message)
    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
def run_gradient_boosting_classifier(data, _max_depth):
    """Fit a gradient-boosting classifier and report accuracy and importances.

    Parameters
    ----------
    data : 2-D array whose last column is the label (cast to int) and whose
        remaining columns are the features.
    _max_depth : ``max_depth`` of the individual trees; also used in the
        name handed to ``plot_feature_importance``.

    The data is split 75/25 at random, so results differ between runs.
    Nothing is returned.
    """
    (feature_train, feature_test,
     label_train, label_test) = train_test_split(
        data[:, 0:-1], data[:, -1].astype(int), test_size=0.25)

    # TODO: Vary Number of Estimators and Learning Rate
    gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50,
                                     max_depth=_max_depth, verbose=True)
    gbc.fit(feature_train, label_train)

    # score() returns accuracy, so the variables are named accordingly.
    training_accuracy = gbc.score(feature_train, label_train)
    testing_accuracy = gbc.score(feature_test, label_test)

    # The heading used to say "Random Forest", which mislabelled the output
    # of this gradient-boosting run.
    print("Gradient Boosting Results for Max Depth: %s" % _max_depth)
    print("Training Accuracy: %s" % training_accuracy)
    print("Testing Accuracy: %s" % testing_accuracy)

    feature_importance = gbc.feature_importances_
    # Spread of the importances over the first tree of every boosting stage.
    stddev = np.std([tree[0].feature_importances_
                     for tree in gbc.estimators_], axis=0)
    indices = np.argsort(feature_importance)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for rank, idx in enumerate(indices, 1):
        print("%d. feature %d (%f)" % (rank, idx, feature_importance[idx]))

    plot_feature_importance(
        feature_importance, indices, stddev,
        "gradient-boosted-classifier-feature-importance-depth-"
        + str(_max_depth))
def test_oob_improvement():
    """Check the length of oob_improvement_ and its first five values.
    """
    n_stages = 100
    clf = GradientBoostingClassifier(subsample=0.5, random_state=1,
                                     n_estimators=n_stages)
    clf.fit(X, y)

    oob = clf.oob_improvement_
    assert oob.shape[0] == n_stages

    # hard-coded regression test - change if modification in OOB computation
    expected_head = np.array([0.19, 0.15, 0.12, -0.12, -0.11])
    assert_array_almost_equal(oob[:5], expected_head, decimal=2)
def __init__(self, estimator, phase, n_jobs, cv_k_fold, parameters,
             X_train, y_train, X_test, y_test):
    """Store the estimator; in the "train" phase tune it by grid search.

    Parameters
    ----------
    estimator : the ensemble learner to configure and keep.
    phase : when equal to "train", a grid search over ``parameters`` is run
        on a GradientBoostingClassifier, the per-stage train/test loss is
        plotted to "loss_cv.png", and the best parameters are applied to
        ``estimator``.
    n_jobs, cv_k_fold, parameters : forwarded to GridSearchCV.
    X_train, y_train : data for the grid search and the final fit.
    X_test, y_test : data for the test-loss curve.
    """
    # Defined for every phase so that reading the attribute never fails.
    self.best_params = None

    if phase == "train":
        clf = GradientBoostingClassifier()
        # scoring could also be "precision" or "recall"
        gscv = GridSearchCV(clf, parameters, verbose=10, scoring="f1",
                            n_jobs=n_jobs, cv=cv_k_fold)
        gscv.fit(X_train, y_train)
        self.best_params = gscv.best_params_

        clf.set_params(**gscv.best_params_)
        clf.fit(X_train, y_train)

        n_stages = len(clf.estimators_)
        train_loss = clf.train_score_
        test_loss = np.empty(n_stages)
        # The loss function operates on raw decision scores, not on
        # predicted class labels; using staged_predict here produced a
        # curve that is not comparable with train_score_.
        staged_scores = clf.staged_decision_function(X_test)
        for i, raw_score in enumerate(staged_scores):
            test_loss[i] = clf.loss_(y_test, raw_score)

        stages = np.arange(n_stages) + 1
        plt.plot(stages, test_loss, label='Test')
        plt.plot(stages, train_loss, label='Train')
        plt.xlabel('the number of weak learner:Boosting Iterations')
        plt.ylabel('Loss')
        plt.legend(loc="best")
        plt.savefig("loss_cv.png")
        plt.close()

        estimator.set_params(**gscv.best_params_)

    self.estimator = estimator
    self.one_hot_encoding = None
def mse_sklearn(x_train, x_test, y_train, y_test, n_estimators):
    """Fit a gradient-boosting classifier and return its test F1 score.

    Despite the name, the value returned is ``f1_score(y_test, pred)``.
    Tree depth and minimum leaf size come from the module constants
    ``MAX_DEPTH`` and ``MIN_SAMPLES_LEAF``.
    """
    model = GradientBoostingClassifier(
        n_estimators=n_estimators,
        min_samples_leaf=MIN_SAMPLES_LEAF,
        max_depth=MAX_DEPTH,
    )
    model.fit(x_train, y_train)
    return f1_score(y_test, model.predict(x_test))
def _accuracy_pct(model, X, Y):
    """Return the accuracy of *model* on (X, Y) as a percentage string."""
    return str(round(model.score(X, Y) * 100, 5))


def _show_accuracy(algorithm, acc_train, acc_test, test_suffix="%"):
    """Place the label reporting train/test accuracy for *algorithm*."""
    label1 = tk.Label(root,
                      text=algorithm + "\n\nTraining Set: " + acc_train +
                      "%\n\nTest Set: " + acc_test + test_suffix,
                      font=("arial", 13, "bold"))
    label1.place(x=300, y=180, width=700, height=400)


def _save_and_show(filename):
    """Save the current figure to *filename*, then display it."""
    # Save first: after the window opened by show() is closed, the current
    # figure is a fresh empty one and savefig would write a blank image.
    plt.savefig(filename)
    plt.show()


def _plot_logreg_coefficients(model, legend_label, filename):
    """Plot the coefficients of a fitted logistic regression per feature."""
    plt.figure(figsize=(8, 6))
    plt.plot(model.coef_.T, 'o', label=legend_label)
    plt.xticks(range(diabetes.shape[1]), diabetes_features, rotation=90)
    plt.hlines(0, 0, diabetes.shape[1])
    plt.ylim(-5, 5)
    plt.xlabel("Feature")
    plt.ylabel("Coefficient magnitude")
    plt.legend()
    _save_and_show(filename)


def callback(*args):
    """Train the model selected in the drop-down and display its results.

    Reads the selected algorithm name from ``variable``, fits the matching
    estimator on the module-level training data, shows training and test
    accuracy in a label on ``root``, and saves and displays the plot that
    belongs to that algorithm. Unknown names are ignored.

    Args:
        *args: ignored; accepted so the function can be used as a Tk
            variable-trace or widget callback.
    """
    clear_text()
    algorithm = str(variable.get())

    # name -> (estimator factory, plot file name, train on min-max scaled data)
    importance_models = {
        "Decision Tree Classifier":
            (lambda: DecisionTreeClassifier(), 'feature_importance', False),
        "Random Forest Classifier":
            (lambda: RandomForestClassifier(), 'feature_importance_rf', False),
        "Gradient Boosting Classifier":
            (lambda: GradientBoostingClassifier(), 'feature_importance_gb',
             False),
        "SV Classifier":
            (lambda: SVC(), 'feature_importance_svc', False),
        "Decision Tree Classifier(Depth=3)":
            (lambda: DecisionTreeClassifier(max_depth=3, random_state=0),
             'feature_importance', False),
        "Random Forest Classifier(n_estimators=150)":
            (lambda: RandomForestClassifier(n_estimators=150, random_state=0),
             'feature_importance_rf', False),
        "Gradient Boosting Classifier(Depth=1)":
            (lambda: GradientBoostingClassifier(random_state=0, max_depth=1),
             'feature_importance_gb1', False),
        "Gradient Boosting Classifier(Learning_rate=0.01)":
            (lambda: GradientBoostingClassifier(random_state=0,
                                                learning_rate=0.01),
             'feature_importance_gb2', False),
        "SV Classifier(Random_state=42)":
            (lambda: SVC(kernel='linear', random_state=42),
             'feature_importance_svc(42)', False),
        "Min Max Scalar":
            (lambda: SVC(), 'feature_importance_svc(42)', True),
        "SV Classifier(C=1000)":
            (lambda: SVC(C=1000), 'feature_importance_svc(42)', True),
    }

    # name -> (estimator factory, legend label, plot file name)
    logreg_models = {
        "Logistics Regression":
            (lambda: LogisticRegression(), "C=1", 'log_coef'),
        "Logistics Regression(C=150)":
            (lambda: LogisticRegression(C=150), "C=150", 'log_coef100'),
        "Logistics Regression(C=0.01)":
            (lambda: LogisticRegression(C=0.01), "C=0.01", 'log_coef001'),
    }

    if algorithm == "K-Means Classifier":
        knn = KNeighborsClassifier(n_neighbors=9)
        knn.fit(X_train, Y_train)
        _show_accuracy(algorithm,
                       _accuracy_pct(knn, X_train, Y_train),
                       _accuracy_pct(knn, X_test, Y_test),
                       test_suffix=" %")

        # Empty the shared result lists in place so that a second selection
        # does not leave more scores than there are neighbour settings.
        # NOTE(review): assumes both are module-level lists - confirm.
        del training_accuracy[:]
        del test_accuracy[:]
        for n_neighbors in neighbors_settings:
            # build the model
            knn = KNeighborsClassifier(n_neighbors=n_neighbors)
            knn.fit(X_train, Y_train)
            # record training and test set accuracy
            training_accuracy.append(knn.score(X_train, Y_train))
            test_accuracy.append(knn.score(X_test, Y_test))

        plt.title("K-Means")
        plt.plot(neighbors_settings, training_accuracy,
                 label="training accuracy")
        plt.ylabel("Accuracy")
        plt.xlabel("n_neighbors")
        plt.legend()
        _save_and_show('knn_compare_model')

    elif algorithm in importance_models:
        make_model, filename, use_scaling = importance_models[algorithm]
        model = make_model()
        X_fit, X_eval = X_train, X_test
        if use_scaling:
            # Learn the scaling from the training set only and apply the same
            # transformation to the test set.
            scaler = MinMaxScaler()
            X_fit = scaler.fit_transform(X_train)
            X_eval = scaler.transform(X_test)
        model.fit(X_fit, Y_train)
        # Score on the same representation the model was trained on.
        _show_accuracy(algorithm,
                       _accuracy_pct(model, X_fit, Y_train),
                       _accuracy_pct(model, X_eval, Y_test))
        # NOTE(review): this helper is also handed SVC models; confirm it
        # supports estimators that expose no feature importances.
        plot_feature_importances_diabetes(model)
        _save_and_show(filename)

    elif algorithm in logreg_models:
        make_model, legend_label, filename = logreg_models[algorithm]
        model = make_model()
        model.fit(X_train, Y_train)
        _show_accuracy(algorithm,
                       _accuracy_pct(model, X_train, Y_train),
                       _accuracy_pct(model, X_test, Y_test))
        _plot_logreg_coefficients(model, legend_label, filename)
# # Here are partial plots from a very simple model on the Titanic data. # In[ ]: titanic_data = pd.read_csv('../input/titanic/train.csv') titanic_y = titanic_data.Survived clf = GradientBoostingClassifier() titanic_X_colns = ['PassengerId','Age', 'Fare',] titanic_X = titanic_data[titanic_X_colns] my_imputer = Imputer() imputed_titanic_X = my_imputer.fit_transform(titanic_X) clf.fit(imputed_titanic_X, titanic_y) titanic_plots = plot_partial_dependence(clf, features=[1,2], X=imputed_titanic_X, feature_names=titanic_X_colns, grid_resolution=8) # These might seem surprising at first glance. But they show some interesting insights: # * Being young increased your odds of survival. This is consistent with historical recountings that they got women and children off the Titanic first. # * People who paid more had better odds of survival. It turns out that higher fares got you a cabin that was closer to the top of the boat, and may have given you better odds of getting a life-boat. # # # Conclusion # Partial dependence plots are a great way (though not the only way) to extract insights from complex models. These can be incredibly powerful for communicating those insights to colleagues or non-technical users. # # There are a variety of opinions on how to interpret these plots when they come from non-experimental data. Some claim you can conclude nothing about cause-and-effect relationships from data unless it comes from experiments. Others are more positive about what can be learned from non-experimental data (also called observational data). It's a divisive topic in the data science world, beyond the scope of this tutorial. # # However most agree that these are useful to understand your model. Also, given the messiness of most real-world data sources, it's also a good sanity check that your model is capturing realistic patterns. #
# Encode the validation data with the per-column encoders when a mapping is
# available; otherwise use the validation data as it is.
if LabelEncoder_mapping is not None:
    df_val = input_val_data.apply(lambda x: LabelEncoder_mapping[x.name].transform(x))
else:
    df_val = input_val_data[:]
independent_variable_val = df_val[independent_variable_name].values
dependent_variable_val = df_val[dependent_variable_name].values
############################################################################################################
### Build Gradient Boosting classifier - Development #######################
### Default [add later #oob_score=True, ]
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
clf = clf.fit(independent_variable, dependent_variable)
# Selecting good features by mean decrease impurity.
# NOTE(review): `model` is not used in the visible code - confirm it is needed.
model = GradientBoostingClassifier()
scores = selecting_good_features(independent_variable, dependent_variable, independent_variable_name, 100, 0.3)
# Print (mean score, feature) pairs, highest mean score first.
print sorted([(round(np.mean(score), 4), feat) for feat, score in scores.items()], reverse=True)
# Run with a varying number of trees and see the effect on prediction accuracy.
iterate_tree_in_Gradient_Boosting_Classifier(25, independent_variable, dependent_variable)
plt.savefig('Number of Trees vs Accuracy.png')
plt.show()
# Generate a simple plot of the training learning curve.
title = "Learning Curves - Gradient Boosting Classifier"
cv = model_selection.ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)  # n_splits: try 30
clt_ext.fit(X, y) score_ext = cross_val_score(clt_ext, X, y, cv=5).mean() print(score_ext) # In[ ]: #Gradient Boost import warnings warnings.filterwarnings clf_gb = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1, max_depth=3, subsample=0.5, random_state=0).fit(X, y) clf_gb = clf_gb.fit(X, y) score_gb = cross_val_score(clf_gb, X, y, cv=5).mean() print(score_gb) # In[ ]: #Ada Boost clf_ada = AdaBoostClassifier(n_estimators=400, learning_rate=0.1) clf_ada.fit(X, y) score_ada = cross_val_score(clf_ada, X, y, cv=5).mean() print(score_ada) # In[ ]: #Extreme Gradient Boosting clf_xgb = xgb.XGBClassifier(max_depth=2,
# SGD classifier: cross-validate on the SMOTE-resampled training data, then
# fit on all of it and evaluate on the test set.
sgd_clf = SGDClassifier()
# NOTE: this is the mean cross-validated accuracy, printed under the label
# "Training Accuracy".
results = cross_val_score(sgd_clf, X_train_smote, y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
sgd_clf.fit(X_train_smote, y_train_smote)
predictions = sgd_clf.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
print('Confusion Matrix:\n',confusion_matrix(y_test, predictions))
print('Classification report\n', classification_report(y_test, predictions))

"""###**Gradient Boosting Classifier**"""

# Same procedure for gradient boosting, with a 10-fold split.
kfold = KFold(n_splits=10)
gb_clf = GradientBoostingClassifier()
results = cross_val_score(gb_clf, X_train_smote, y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
gb_clf.fit(X_train_smote, y_train_smote)
predictions = gb_clf.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
print('Confusion Matrix:\n',confusion_matrix(y_test, predictions))
print('Classification report\n', classification_report(y_test, predictions))

"""###**Random Forest**"""

# Same procedure for a random forest; no confusion matrix or report is
# printed for it in this part of the script.
from sklearn.ensemble import RandomForestClassifier
kfold = KFold(n_splits=10)
rf_clf = RandomForestClassifier()
results = cross_val_score(rf_clf, X_train_smote, y_train_smote, cv=kfold, scoring='accuracy')
print("Training Accuracy: %.3f" % (results.mean()*100.0))
rf_clf.fit(X_train_smote, y_train_smote)
predictions = rf_clf.predict(X_test)
print("Testing Accuracy:%.3f" % (accuracy_score(y_test, predictions)*100.0))
sys.exit(-1) else: dataset = sys.argv[1] store_dir = sys.argv[2] # Create several array for the data if dataset == 'random': X, y, T, valid = misc.generate_samples(N_SAMPLES, N_FEATURES, RND_SEED) else: raise ValueError('The dataset is not known. The possible choices are:' ' random') # Fit the sklearn gradient boosting clf = GradientBoostingClassifier() clf.set_params(**params_sklearn) clf.fit(X, y) # Fit the xgboost gradient boosting xgb_training = xgb.DMatrix( X, label=y, missing=None, weight=None, silent=False, feature_names=None, feature_types=None) n_est = params_xgboost.pop('n_estimators') bst = xgb.train(params_xgboost, xgb_training, n_est) # Fit the LightGBM gradient boosting max_bin = params_lgbm.pop('max_bin')
# 1. Train a single decision tree and predict with it
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()  # decision tree classifier with default settings
dtc.fit(X_train, y_train)  # learn from the training split
dtc_y_predict = dtc.predict(X_test)  # predict on the test features

# 2. Train a random forest classifier (ensemble model) and predict with it
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_y_pred = rfc.predict(X_test)

# 3. Train gradient boosted decision trees (ensemble model) and predict
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
gbc_y_pred = gbc.predict(X_test)

# Performance of the models at predicting whether a Titanic passenger survived
from sklearn.metrics import classification_report

# classification_report expects (y_true, y_pred); with the arguments swapped
# the reported precision and recall trade places.

# Accuracy, precision, recall and F1 of the single decision tree on the test set
print('单一决策树的准确率是:', dtc.score(X_test, y_test))
print(classification_report(y_test, dtc_y_predict))

# Accuracy, precision, recall and F1 of the random forest on the test set
print("随机分类器的准确率是:", rfc.score(X_test, y_test))
print(classification_report(y_test, rfc_y_pred))

# Accuracy of the gradient boosted decision trees on the test set
print('梯度提升决策树的准确性是:', gbc.score(X_test, y_test))
]
# Running index used to name the dumped model files.
modelnum = 0
# Train one set of models per Elo group.
for elo_name, elo_df in train_df.groupby(train_df['elo_groups']):
    msg('working on elo group %s, of size %i' % (elo_name, elo_df.shape[0]))
    msg('computing perfect-move model')
    gbc = GradientBoostingClassifier(min_samples_split=500, min_samples_leaf=300, n_estimators=NUM_ESTIMATORS, verbose=1, subsample=0.5, learning_rate=0.2)
    X = elo_df[features]
    # Binary target: True where clipped_movergain is exactly 0.
    y = (elo_df['clipped_movergain'] == 0)
    gbc.fit(X, y)
    # Stored as [group name, 1.0, model] in '<blundermodel_dir><modelnum>.p'.
    joblib.dump([elo_name, 1.0, gbc], '%s%i.p' % (blundermodel_dir, modelnum))
    modelnum = modelnum + 1
    # One quantile regressor per requested quantile, fitted only on the rows
    # with a negative clipped_movergain.
    for mg_quant in mg_quants:
        msg('computing mg_quant %f' % mg_quant)
        gbr = GradientBoostingRegressor(loss='quantile', alpha=mg_quant, min_samples_split=500, min_samples_leaf=300, n_estimators=NUM_ESTIMATORS, verbose=1, subsample=0.5, learning_rate=0.2)
        imperfect_df = elo_df[elo_df['clipped_movergain'] < 0]
        X = imperfect_df[features]
def gradBoost(X, y):
    """Return a default GradientBoostingClassifier fitted on (X, y)."""
    from sklearn.ensemble import GradientBoostingClassifier

    model = GradientBoostingClassifier()
    model.fit(X, y)
    return model
# List of comments
comments = []

# https://stackoverflow.com/questions/49100615/nltk-detecting-whether-a-sentence-is-interogative-or-not
nltk.download('nps_chat')
posts = nltk.corpus.nps_chat.xml_posts()
posts_text = [post.text for post in posts]

# Divide into train and test 80/20. Both halves are cut at the same index so
# the test set is the final 20% and shares no posts with the training set.
split_at = int(len(posts_text) * 0.8)
train_text = posts_text[:split_at]
test_text = posts_text[split_at:]

# Get TF-IDF features; the vocabulary is learnt from the training posts only.
vectorizer = TfidfVectorizer(ngram_range=(1, 3),
                             min_df=0.001,
                             max_df=0.7,
                             analyzer='word')
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

# The dialogue-act class of every post is the label.
y = [post.get('class') for post in posts]
y_train = y[:split_at]
y_test = y[split_at:]

gb = GradientBoostingClassifier(n_estimators=400, random_state=0)
gb.fit(X_train, y_train)

# Keep the comments classified as a yes/no or wh- question, plus any comment
# containing a question mark.
question_comments = []
for comment in comments:
    type_of_comment = gb.predict(vectorizer.transform([comment]))
    if (type_of_comment == 'ynQuestion' or type_of_comment == 'whQuestion'
            or '?' in comment):
        question_comments.append(comment)
question_comments
'U_behaviors_sum10', 'Item_sale10', 'Item_sale5', 'Item_sale3', 'Item_sale1', 'car5', 'car4', 'car3', 'car2', 'car1', 'buy5', 'buy4', 'buy3', 'buy2', 'buy1', 'I_order10', 'I_order5', 'I_order3', 'I_order1', 'I_buyer10', 'I_buyer5', 'I_buyer3', 'I_buyer1', 'behav1', 'behav2', 'behav3', 'behav4', 'last_time' ] df_train = pd.read_csv("train_feature.csv") df_validation = pd.read_csv("validation_feature.csv") #数据归一化处理 ui = df_train[["user_id", "item_id"]] samples = df_train[features] target = df_train["tag"] classifier = GradientBoostingClassifier(n_estimators=200, learning_rate=1.0, max_depth=5, random_state=0) classifier.fit(samples, target) # 训练数据来学习,不需要返回值 validation_feature = df_validation[features] x = classifier.predict(validation_feature) # 测试数据,分类返回标记 print x validation_ui = df_validation[["user_id", "item_id"]] validation_ui["tag"] = x validation_result = validation_ui[validation_ui.tag == 1][[ "user_id", "item_id" ]] os.chdir('..') validation_result.to_csv("predict_v_Gbrt.csv", index=False)
from time import time

fold1_df = load_dataframe(filename='fold1_NA_features.dat')
fold2_df = load_dataframe(filename='fold2_NA_features.dat')
del fold1_df['id']
del fold2_df['id']

# Let each split consider a quarter of the columns.
n_features = int(len(fold1_df.columns) / 4)

p0 = time()
# `loss` is passed by keyword: scikit-learn estimators take their
# constructor parameters as keywords, and newer releases reject positional
# ones.
clf = GradientBoostingClassifier(loss='deviance',
                                 learning_rate=0.05,
                                 n_estimators=100,
                                 max_features=n_features)
# Column 0 is used as the target and the remaining columns as features.
clf.fit(fold1_df.iloc[:, 1:], fold1_df.iloc[:, 0])
# Probability of the second class for every fold-2 row.
preds_ens = clf.predict_proba(fold2_df.iloc[:, 1:])[:, 1]
print(time() - p0)

## Ensemble the predictions
true_values = fold2_df['label']
df, best_index = f1_scores_plot(preds_ens, true_values)
df['f1_score'][best_index]  #Li

### Check performance on fold3
fold3_df = load_dataframe(filename='fold3_NA_features.dat')
del fold3_df['id']
# Columns named 'pca...dw' are log-transformed; log(0) gives -inf, which is
# then replaced by 0.
dw_cols = [x for x in fold1_df.columns if x[-2:] == 'dw' and x[:3] == 'pca']
fold3_df[dw_cols] = np.log(np.array(fold3_df[dw_cols]))
fold3_df = fold3_df.replace([-np.inf], 0)
x_test = fold3_df[[x for x in fold3_df.columns if x != 'label']]
class qbc_dcw:
    """Query-by-committee sample selector with a diversity term.

    ``fit`` trains three gradient boosting classifiers on disjoint random
    thirds of the training data. ``predict`` scores every candidate row by
    the largest accuracy-weighted prediction entropy among the three
    classifiers plus 100 times the cosine distance to a neighbouring row,
    and returns the ``each_size`` highest-scoring candidates.
    """

    def __init__(self, distance_scope=2, each_size=30):
        """Store the selection settings.

        Args:
            distance_scope: number of neighbours requested from the nearest
                neighbour search; must be at least 2.
            each_size: number of rows returned by ``predict``; must be
                positive.

        Raises:
            AssertionError: if an argument is out of range.
        """
        # Earlier checks on `balance_scale` and `bigger_par` referred to
        # names that are not parameters of this class and raised NameError
        # on every construction; they are removed.
        assert distance_scope >= 2, "distance_scope must be at least 2"
        assert each_size > 0, "each_size must be positive"
        self.distance_scope = distance_scope
        self.each_size = each_size
        self._X_train = None
        self._y_train = None

    def fit(self, X_train, y_train):
        """Train the three committee members on random thirds of the data.

        Args:
            X_train: training features; indexed with ``.iloc``.
            y_train: training labels with as many rows as ``X_train``;
                indexed with ``.iloc``.

        Returns:
            self
        """
        assert X_train.shape[0] == y_train.shape[0], "the size of X_train must be equal to the size of y_train"
        self._X_train = X_train
        self._y_train = y_train

        # Shuffle the row positions and cut them into three disjoint parts.
        choice_list = list(range(len(self._X_train)))
        random.shuffle(choice_list)
        num1 = int(len(self._X_train) / 3)
        num2 = int(len(self._X_train) * 2 / 3)
        parts = (choice_list[:num1], choice_list[num1:num2],
                 choice_list[num2:])

        self.gb_clf1 = GradientBoostingClassifier()
        self.gb_clf2 = GradientBoostingClassifier()
        self.gb_clf3 = GradientBoostingClassifier()
        committee = (self.gb_clf1, self.gb_clf2, self.gb_clf3)
        for member, rows in zip(committee, parts):
            member.fit(self._X_train.iloc[rows], self._y_train.iloc[rows])
        return self

    def predict(self, X_predict):
        """Return the ``each_size`` highest-scoring rows of *X_predict*.

        Rows are returned in ascending score order.
        """
        assert self._X_train is not None and self._y_train is not None, "must fit before predict!"
        assert X_predict.shape[1] == self._X_train.shape[1], "the feature number of X_predict must be equal to X_train"
        scores = np.array(self.__predict(X_predict))
        order = np.argsort(scores)
        return X_predict.iloc[order[-self.each_size:]]

    def __scores_func(self, proba):
        """Return the Shannon entropy (natural log) of each probability row.

        ``proba`` looks like ``[[0.1, 0.9], [0.4, 0.6], [0.7, 0.3]]``.
        """
        scores_sort = []
        for sc in proba:
            col = 0
            for p in sc:
                # p == 0 and p == 1 contribute nothing; skipping them also
                # avoids evaluating log(0).
                if p in [0, 1]:
                    col += 0
                else:
                    col += -p * math.log(p, math.e)
            scores_sort.append(col)
        return scores_sort

    def __predict(self, x_choice):
        """Score every row of *x_choice*; higher means more worth selecting."""
        committee = (self.gb_clf1, self.gb_clf2, self.gb_clf3)
        # Prediction entropy of every candidate, one list per member.
        entropies = [self.__scores_func(member.predict_proba(x_choice))
                     for member in committee]
        # Each member is weighted by its accuracy on the full training set.
        weights = [member.score(self._X_train, self._y_train)
                   for member in committee]

        x_all = pd.concat([self._X_train, x_choice])
        neigh = NearestNeighbors()
        neigh.fit(x_all)
        distance_number = neigh.kneighbors(x_choice.values,
                                           self.distance_scope,
                                           return_distance=False)

        scores_sort = []
        for i in range(len(entropies[0])):
            # Cosine distance to the second entry of the neighbour list; the
            # first entry is skipped because the candidate itself is part of
            # x_all.
            diversity = pairwise_distances(
                [x_choice.iloc[i]],
                x_all.iloc[distance_number[i][1:2]],
                metric="cosine").sum()
            uncertainty = max(entropy[i] * weight
                              for entropy, weight in zip(entropies, weights))
            scores_sort.append(uncertainty + 100 * diversity)
        return scores_sort

    def __repr__(self):
        return "qbc_ddbcw(distance_scope=%d,each_size=%d)" % (self.distance_scope, self.each_size)
random_state=0)

# Feature Scaling
# The scaler is fitted on the training set only and re-used for the test set.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Fitting a gradient boosting classifier to the Training set
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(loss='deviance', n_estimators=500, learning_rate=0.001, criterion='friedman_mse', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)
# Second prediction. NOTE(review): made with the same classifier on the same
# data, so it equals y_pred - confirm whether another model was intended.
y_pred1 = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

# Visualising the Training set results
from matplotlib.colors import ListedColormap
X_set, y_set = X_train, y_train
X1, X2 = np.meshgrid(
    np.arange(start=X_set[:, 0].min() - 1,
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score , confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

data = load_breast_cancer()
X = data.data
y = data.target

skf = StratifiedKFold(n_splits=5)
p_rf = np.zeros(y.shape[0])

# Evaluate ensembles of 10, 20, ..., 90 trees with 5-fold stratified CV.
for n in range(10, 100, 10):
    for train, test in skf.split(X, y):
        x_train = X[train]
        x_test = X[test]
        y_train = y[train]
        y_test = y[test]
        # Use the loop value as the ensemble size; with a fixed size every
        # pass of the outer loop trained the very same model.
        clf = GradientBoostingClassifier(n_estimators=n,
                                         learning_rate=1.0,
                                         max_depth=1,
                                         random_state=0)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)
        cm = confusion_matrix(y_test, y_pred)
        print(cm)
        print('score For' ,n , ' ' , accuracy_score(y_test , y_pred))
max_depth=1, # random_state=0, # max_leaf_nodes=10 ) q = 10 scores = cross_val_score(clfGB, x_treino, y_treino.ravel(), cv=q) print( # f'k = {k}', f'scores:{scores}', f'acurácia média = {round(100*mean(scores),2)} %') clfRF.fit(x_treino, y_treino.ravel()) clfGB.fit(x_treino, y_treino.ravel()) atributos_selecionados = [ 'sexo_ ', 'sexo_M', 'sexo_F', 'sexo_N', 'possui_email', 'local_onde_reside', 'tipo_endereco', 'idade', 'estado_civil', 'qtde_dependentes', 'dia_vencimento', 'possui_telefone_residencial', 'meses_na_residencia',
X = iris.data
y = iris.target
X
y

# Split and Randomize Data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=33)
X_train
X_test

# Step 2 Define Classifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
clf2 = GradientBoostingClassifier()

# Step 3 Train the Classifier
clf2.fit(X_train, y_train)
pred2 = clf2.predict(X_test)

# Step 4: Evaluate the Classifier
print("GBC accuracy score : ", accuracy_score(y_test, pred2))

# Step 5: Save the Model
# NOTE(review): `sklearn.externals.joblib` is not available in recent
# scikit-learn releases; confirm the installed version or import `joblib`
# directly.
from sklearn.externals import joblib
# Save the classifier trained above (`clf2`); no `clf` exists at this point.
joblib.dump(clf2, 'iris.pkl')

# Step 6: Load the Model & Prediction
clf = joblib.load('iris.pkl')
clf
markerfacecolor='blue', markersize=10)
plt.title('Variation of accuracy by the number of trees')
plt.xlabel('Number of trees')
plt.ylabel('Mean Accuracy')

# Number of trees with the highest accuracy.
n = np.array(np.where( accuracy == np.max(accuracy))) + 1  # because the index starts at 0
n = n[0, 0]
n

# NOTE(review): `n` computed above is not used here; n_estimators is fixed
# at 5 - confirm whether n_estimators=n was intended.
gbm = GradientBoostingClassifier(n_estimators=5, learning_rate=0.3, max_depth=2)
# Train the model using the training sets and check score
gbm.fit(all_X, all_y)
# Predict Output
y_pred = gbm.predict(test_X)
print(confusion_matrix(test_y, y_pred))
print(classification_report(test_y, y_pred))
scores = cross_val_score(gbm, all_X, all_y, cv=kfold)
#scores.sort()
# `accuracy` is re-bound here to the mean cross-validated score.
accuracy = scores.mean()
print("The accuracy for GBM is ", accuracy)
submit("submission_gbm.csv", gbm)

######################################### SVM #########################################
test_size=0.1, random_state=508, stratify=got_target) # Building a gbm gbm = GradientBoostingClassifier( loss='deviance', learning_rate=1.5, n_estimators=100, max_depth=3, criterion='friedman_mse', warm_start=False, random_state=508, ) gbm_basic_fit = gbm.fit(X_train, y_train) gbm_basic_predict = gbm_basic_fit.predict(X_test) # Training and Testing Scores print('Training Score', gbm_basic_fit.score(X_train, y_train).round(4)) print('Testing Score:', gbm_basic_fit.score(X_test, y_test).round(4)) cv_lr_3 = cross_val_score(gbm, got_data, got_target, cv=3, scoring='roc_auc') print(pd.np.mean(cv_lr_3)) ######################### # Hyper Parameter Tuning #########################
# Use the greedy method to find two subsets: true inliers and true outliers.
inliers, dummy = greedy_removal(all_reps, 0.5)
dummy, outliers = greedy_removal(all_reps, 1.0)
print("%s %s %s" % (len(inliers), len(outliers), len(all_reps)))

# Fix one key order so that predictions can be matched back to their keys on
# both Python 2 and 3 (dict views cannot be indexed on Python 3).
rep_keys = list(all_reps.keys())

if True:
    # Train a classifier on the true inliers (label 0) and true outliers
    # (label 1).
    X_inliers = [all_reps[key] for key in inliers]
    X_outliers = [all_reps[key] for key in outliers]
    X = np.vstack((X_inliers, X_outliers))
    y = np.hstack(([0 for key in inliers], [1 for key in outliers]))
    #clf = svm.SVC()
    #clf = RandomForestClassifier(n_estimators=128)
    clf = GradientBoostingClassifier(n_estimators=128, learning_rate=1.0)
    print("%s %s" % (X, y))
    # Fit once; fitting a second time on the same data only repeats the work.
    clf = clf.fit(X, y)

    # Classify every representation with the trained classifier.
    results = clf.predict([all_reps[key] for key in rep_keys])
    print(results)
    inliers = [rep_keys[i] for i in range(len(results)) if results[i] == 0]
    outliers = [rep_keys[i] for i in range(len(results)) if results[i] == 1]
    print("%s %s" % (len(inliers), len(outliers)))

images_dir = os.path.dirname(os.path.realpath(rep_keys[0]))
main_person_dir = os.path.join(images_dir, 'main_person')
other_persons_dir = os.path.join(images_dir, 'other_persons')
# In[58]:

# Boosting

# In[69]:

# Boosting on oversampled data: compare several learning rates.
from sklearn.ensemble import GradientBoostingClassifier
lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=20,
                                        learning_rate=learning_rate,
                                        max_features=2,
                                        max_depth=2,
                                        random_state=0)
    gb_clf.fit(X_t, y_t)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
    print("Accuracy score (validation): {0:.3f}".format(
        gb_clf.score(X_test, y_test)))

# In[60]:

# Refit with the chosen learning rate of 0.75.
gb_clf = GradientBoostingClassifier(n_estimators=20,
                                    learning_rate=0.75,
                                    max_features=2,
                                    max_depth=2,
                                    random_state=0)
gb_clf.fit(X_t, y_t)
# Report the rate this model was built with; the loop variable
# `learning_rate` still holds the last value of lr_list.
print("Learning rate: ", gb_clf.learning_rate)
print("Accuracy score (training): {0:.3f}".format(gb_clf.score(X_t, y_t)))
print("Accuracy score (validation): {0:.3f}".format(
    gb_clf.score(X_test, y_test)))
# MODEL-9) Stochastic Gradient Descent: fit on the training split and report
# accuracy (as a percentage) on the validation split.
sgd = SGDClassifier()
sgd.fit(x_train, y_train)
y_pred = sgd.predict(x_val)
acc_sgd = round(accuracy_score(y_pred, y_val) * 100, 2)
print "MODEL-9: Accuracy of Stochastic Gradient Descent : ", acc_sgd

#OUTPUT:-
#MODEL-9: Accuracy of Stochastic Gradient Descent : 71.07

#MODEL-10) Gradient Boosting Classifier
#------------------------------------------
from sklearn.ensemble import GradientBoostingClassifier
gbk = GradientBoostingClassifier()
gbk.fit(x_train, y_train)
y_pred = gbk.predict(x_val)
acc_gbk = round(accuracy_score(y_pred, y_val) * 100, 2)
print "MODEL-10: Accuracy of GradientBoostingClassifier : ", acc_gbk

#OUTPUT:-
#MODEL-10: Accuracy of GradientBoostingClassifier : 84.77

#Let's compare the accuracies of each model!
models = pd.DataFrame({
    'Model': [
        'Logistic Regression', 'Gaussian Naive Bayes', 'Support Vector Machines', 'Linear SVC', 'Perceptron', 'Decision Tree', 'Random Forest', 'KNN', 'Stochastic Gradient Descent', 'Gradient Boosting Classifier'
param['bst:max_depth'] = 6
param['eval_metric'] = 'auc'
param['silent'] = 1
param['nthread'] = 4
# A list of pairs lets 'eval_metric' appear twice (AUC and AMS).
# list(...) is needed because dict.items() cannot be concatenated with a
# list on Python 3. The metric name had been mangled by e-mail obfuscation;
# 'ams@0.15' is XGBoost's AMS metric with a threshold ratio of 0.15.
plst = list(param.items()) + [('eval_metric', 'ams@0.15')]
watchlist = [(xgmat, 'train')]
# boost 10 trees
num_round = 10
print('loading data end, start to boost trees')

print("training GBM from sklearn")
tmp = time.time()
gbm = GradientBoostingClassifier(n_estimators=num_round,
                                 max_depth=6,
                                 verbose=2)
gbm.fit(data, label)
print("sklearn.GBM costs: %s seconds" % str(time.time() - tmp))
#raw_input()

# Time XGBoost training for several thread counts.
print("training xgboost")
threads = [1, 2, 4, 16]
for i in threads:
    param['nthread'] = i
    tmp = time.time()
    plst = list(param.items()) + [('eval_metric', 'ams@0.15')]
    bst = xgb.train(plst, xgmat, num_round, watchlist)
    print("XGBoost with %d thread costs: %s seconds" %
          (i, str(time.time() - tmp)))
print('finish training')
# `float` replaces the `np.float` alias, which newer NumPy releases no
# longer provide; the resulting dtype is the same.
X_test = np.array(X_test).astype(float)
y_test = temp[838860:1048575, 4]
y_test = np.array(y_test).astype(float)

# Single-argument print() calls behave the same on Python 2 and 3.
print('Training data shape:  %s' % (X_train.shape,))
print('Training labels shape:  %s' % (y_train.shape,))
print('Test data shape:  %s' % (X_test.shape,))
print('Test labels shape:  %s' % (y_test.shape,))

# For comparison, compute PCA
pca = PCA(n_components=4)
# Learn the projection from the training data only and apply the same
# projection to the test data. Re-fitting on the test set would put it in a
# different component space from the one the classifier is trained in.
# NOTE: the accuracy figures quoted below were recorded before this change.
Xtr = pca.fit_transform(X_train)
Xts = pca.transform(X_test)

#clf2 = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,max_depth=1, random_state=0) Train Accuracy = 0.921114 Test Accuracy = 0.878907
#clf2 = GradientBoostingClassifier(n_estimators=300, learning_rate=0.05,max_depth=3, random_state=0) Train Accuracy = 0.983857 Test Accuracy = 0.879990
#clf2 = GradientBoostingClassifier(n_estimators=400, learning_rate=0.02,max_depth=3, random_state=0)
#Train Accuracy = 0.978574 Test Accuracy = 0.882669
clf2 = GradientBoostingClassifier(n_estimators=800,
                                  learning_rate=0.4,
                                  max_depth=3,
                                  random_state=0)
#Train Accuracy = 0.979995 Test Accuracy = 0.877224
clf2.fit(Xtr, y_train)
accut = clf2.score(Xts, y_test)
accutr = clf2.score(Xtr, y_train)
print("-------------------------------------Train Accuracy = %f Test Accuracy = %f " % (
    accutr, accut))
def gradient_boosting_classifier(train_x, train_y):
    """Fit a 200-estimator gradient boosting classifier and return it.

    As a side effect the fitted model's ``feature_importances_`` are printed;
    a larger value means a more important feature.

    Args:
        train_x: Training features, passed unchanged to ``fit``.
        train_y: Training labels, passed unchanged to ``fit``.

    Returns:
        The fitted ``GradientBoostingClassifier`` instance.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    model = GradientBoostingClassifier(n_estimators=200)
    model.fit(train_x, train_y)
    # Call form of print: valid on Python 3, same output on Python 2.
    print(model.feature_importances_)
    return model
# Record `i` as soon as its entry against any *other* key `j` exceeds 0.2.
for j in corr_mat:
    if i != j and corr_mat[i][j] > 0.2:
        a.add(i)
print(a)

# Baseline 1: default SVC on the raw features (test score, then train score).
sve = SVC().fit(data_pd, Y_train)
print(sve.score(data_pd1, Y_test))
print(sve.score(data_pd, Y_train))

# Baseline 2: default gradient boosting on the same features.
grb = GradientBoostingClassifier().fit(data_pd, Y_train)
print(grb.score(data_pd1, Y_test))
print(grb.score(data_pd, Y_train))

# Eigen-decomposition of the feature correlation matrix.
cor_matt = data_pd.corr()
eig_vals, eig_vecs = np.linalg.eig(cor_matt)

'''fiting and transforming pca'''
# Fit PCA on the training frame, then apply the same projection to the test
# frame.
pca = PCA(n_components=9)
train_features = pca.fit_transform(data_pd)
test_features = pca.transform(data_pd1)

# SVC retrained on the PCA-reduced features.
sve1 = SVC().fit(train_features, Y_train)
# --- Random forest leaves -> one-hot encoding -> logistic regression ---
rf.fit(X_train, y_train)
rf_enc.fit(rf.apply(X_train))
# The linear model is trained on the separate *_lr split.
_rf_codes_lr = rf_enc.transform(rf.apply(X_train_lr))
rf_lm.fit(_rf_codes_lr, y_train_lr)
_rf_codes_test = rf_enc.transform(rf.apply(X_test))
# Column 1 of predict_proba is used as the ROC score.
y_pred_rf_lm = rf_lm.predict_proba(_rf_codes_test)[:, 1]
fpr_rf_lm, tpr_rf_lm, _ = roc_curve(y_test, y_pred_rf_lm)

print('rf_lm')
# Accumulate into the running totals, then echo this round's values.
auc_rf_lm += auc(fpr_rf_lm, tpr_rf_lm)
score_rf_lm += cal_score(fpr_rf_lm, tpr_rf_lm)
print([cal_score(fpr_rf_lm, tpr_rf_lm), auc(fpr_rf_lm, tpr_rf_lm)])

# --- Gradient boosting leaves -> one-hot encoding -> logistic regression ---
grd = GradientBoostingClassifier(n_estimators=n_estimator)
grd_enc = OneHotEncoder()
grd_lm = LogisticRegression()
grd.fit(X_train, y_train)
# grd.apply(...) is indexed as a 3-D array here; only slice 0 of its last
# axis is kept.
grd_enc.fit(grd.apply(X_train)[:, :, 0])
_grd_codes_lr = grd_enc.transform(grd.apply(X_train_lr)[:, :, 0])
grd_lm.fit(_grd_codes_lr, y_train_lr)
_grd_codes_test = grd_enc.transform(grd.apply(X_test)[:, :, 0])
y_pred_grd_lm = grd_lm.predict_proba(_grd_codes_test)[:, 1]
fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred_grd_lm)

print('grd_lm')
score_grd_lm += cal_score(fpr_grd_lm, tpr_grd_lm)
auc_grd_lm += auc(fpr_grd_lm, tpr_grd_lm)
print([cal_score(fpr_grd_lm, tpr_grd_lm), auc(fpr_grd_lm, tpr_grd_lm)])

# The gradient boosted model by itself
y_pred_grd = grd.predict_proba(X_test)[:, 1]
#test_data = preprocessing.normalize(test_data, norm='l2')
#-----------------------------------------------------------------------------------
#clf1 = linear_model.LogisticRegression(random_state=1)
# Build and train the gradient boosting model in a single step; fit() returns
# the estimator itself, so `clf` is already fitted after this statement.
clf = GradientBoostingClassifier(
    n_estimators=2500,  #learning_rate=1.0,
    verbose=1,
    random_state=1).fit(train_data, train_label)
#clf3 = GaussianNB()
#eclf1 = VotingClassifier(estimators=[('lr', clf1), ('gb', clf2), ('gnb', clf3)], voting='hard')
#eclf1 = eclf1.fit(train_data, train_label)
#result = eclf1.predict_prob()
#-----------------------------------------------------------------------------------
#clf = AdaBoostClassifier(n_estimators=3000)
# NOTE: a standalone `clf.fit(train_data, train_label)` used to follow here.
# It retrained the already-fitted 2500-estimator model from scratch on the
# same data with the same fixed random_state, so it only doubled the training
# time and has been removed. Restore it if one of the commented-out,
# unfitted alternatives is re-enabled.
#clf = BaggingClassifier(n_estimators = 2000)
#clf.fit(train_data, train_label)
#eclf2 = VotingClassifier(estimators=[('lr', clf1), ('gb', clf2), ('gnb', clf3)], voting='soft')
#clf = RandomForestClassifier(n_estimators=6000, max_depth = 4, verbose=1).fit(train_data, train_label)
#knn = neighbors.KNeighborsClassifier()
#logistic = linear_model.LogisticRegression()
#clf = svm.SVC(probability = True)
#clf = tree.DecisionTreeClassifier()
#print('KNN score: %f' % knn.fit(train_data, train_label).score(valid_data, valid_label))
#result = knn.fit(train_data, train_label).predict_proba(test_data)
#train_data = train_data[0:5000,:]
def gradient_boosting_classifier(X_train, y_train):
    """Train a gradient boosting classifier and hand back the fitted model.

    The estimator uses 200 boosting stages and a fixed ``random_state`` of 2.

    Args:
        X_train: Training features, forwarded to ``fit``.
        y_train: Training labels, forwarded to ``fit``.

    Returns:
        The fitted ``GradientBoostingClassifier``.
    """
    from sklearn.ensemble import GradientBoostingClassifier

    booster = GradientBoostingClassifier(random_state=2, n_estimators=200)
    # fit() returns the estimator itself, so its result can be returned as is.
    return booster.fit(X_train, y_train)
# Validation features and labels taken from the `test` frame.
X_val = test[features_selected]
y_val = test['Label']

# Ensemble sizes to try: 10, 20, ..., 200.
n_estimators = range(10,210,10)

# One result slot per ensemble size, kept for plotting later.
score_ne_cv = np.zeros(len(n_estimators))
score_ne_self = np.zeros(len(n_estimators))
ams_ne_cv = np.zeros(len(n_estimators))
ams_ne_self = np.zeros(len(n_estimators))

for i, n_trees in enumerate(n_estimators):
    clf = set_classifier(n_estimators=n_trees)
    clf.fit(X_train, y_train)

    # Boolean masks of the samples predicted as 's' / 'b' on the training set.
    result_self = clf.predict(X_train)
    s_self = (result_self == 's')
    b_self = (result_self == 'b')

    # Same masks for the validation set.
    result_cv = clf.predict(X_val)
    s_cv = (result_cv == 's')
    b_cv = (result_cv == 'b')

    # Score and AMS for both splits at this ensemble size.
    score_ne_self[i] = clf.score(X_train, y_train)
    score_ne_cv[i] = clf.score(X_val, y_val)
    ams_ne_self[i] = AMS(s_self.sum(), b_self.sum())
    ams_ne_cv[i] = AMS(s_cv.sum(), b_cv.sum())

#Plot scores as a function of the number of estimators