def get_most_important_features(train):
  train = train.drop('ID', axis=1)
  train_y = train['TARGET']
  train_X = train.drop('TARGET', axis=1)

  random_forest = RandomForestClassifier(n_estimators=100)
  random_forest.fit(train_X, train_y)

  feature_importance = pd.Series(random_forest.feature_importances_, index=train_X.columns)
  feature_importance.sort_values(inplace=True)
  feature_importance.tail(20).plot(kind='barh', figsize=(15, 7), title='Feature importance by random forest')

  # plt.savefig("feature_importance.png")

  grad_boosting = GradientBoostingClassifier()
  grad_boosting.fit(train_X, train_y)

  feature_importance = pd.Series(grad_boosting.feature_importances_, index=train_X.columns)
  feature_importance.sort_values(inplace=True)
  feature_importance.tail(20).plot(kind='barh', figsize=(10, 7), title='Feature importance by gradient boosting')

  # plt.savefig("feature_importance2.png")

  extra_trees = ExtraTreesClassifier()
  extra_trees.fit(train_X, train_y)

  feature_importance = pd.Series(extra_trees.feature_importances_, index=train_X.columns)
  feature_importance.sort_values(inplace=True)
  feature_importance.tail(20).plot(kind='barh', figsize=(20, 7), title='Feature importance by extra trees classifier')
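A minimal usage sketch for the helper above (the file name is a placeholder; the frame is assumed to contain an 'ID' column and a binary 'TARGET' column, as the function expects):

import pandas as pd
import matplotlib.pyplot as plt

train = pd.read_csv('train.csv')    # hypothetical path
get_most_important_features(train)  # defined above
plt.show()                          # render the importance plots
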
def test_gradient_boosting_validation_fraction():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=100,
                                     n_iter_no_change=10,
                                     validation_fraction=0.1,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)
    gbc2 = clone(gbc).set_params(validation_fraction=0.3)
    gbc3 = clone(gbc).set_params(n_iter_no_change=20)

    gbr = GradientBoostingRegressor(n_estimators=100, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    validation_fraction=0.1,
                                    random_state=42)
    gbr2 = clone(gbr).set_params(validation_fraction=0.3)
    gbr3 = clone(gbr).set_params(n_iter_no_change=20)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    # Check if validation_fraction has an effect
    gbc.fit(X_train, y_train)
    gbc2.fit(X_train, y_train)
    assert gbc.n_estimators_ != gbc2.n_estimators_

    gbr.fit(X_train, y_train)
    gbr2.fit(X_train, y_train)
    assert gbr.n_estimators_ != gbr2.n_estimators_

    # Check that n_estimators_ increases monotonically with n_iter_no_change
    # Set validation
    gbc3.fit(X_train, y_train)
    gbr3.fit(X_train, y_train)
    assert gbr.n_estimators_ < gbr3.n_estimators_
    assert gbc.n_estimators_ < gbc3.n_estimators_
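The behaviour this test relies on can be reproduced directly; a minimal sketch (synthetic data, illustrative parameter values):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=1000, random_state=0)
gbc = GradientBoostingClassifier(n_estimators=1000, n_iter_no_change=10,
                                 validation_fraction=0.1, random_state=42)
gbc.fit(X, y)
# With early stopping enabled, far fewer than 1000 trees are usually built.
print(gbc.n_estimators_)
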
def test_plot_partial_dependence_input():
    # Test partial dependence plot function input checks.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)

    # not fitted yet
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [0])

    clf.fit(X, y)

    assert_raises(ValueError, plot_partial_dependence,
                  clf, np.array(X)[:, :0], [0])

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, plot_partial_dependence,
                  {}, X, [0])

    # must be larger than -1
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [-1])

    # too large feature value
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [100])

    # str feature but no feature_names
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, ['foobar'])

    # not valid features value
    assert_raises(ValueError, plot_partial_dependence,
                  clf, X, [{'foo': 'bar'}])
Example #4
def main(args):
    global verbose
    verbose = args.verbose

    # Load files
    if verbose: logger.info('Loading {}'.format(args.train_file))
    train_X, train_y = load_file(args.train_file)
    if verbose: logger.info('Loading {}'.format(args.test_file))
    test_X, test_y = load_file(args.test_file)

    # # Codes for Grid Search
    # params = [
    #     {'n_estimators': [50000], 'learning_rate': [2**i for i in np.arange(-10, -9, .25)], 'max_features': ['log2',], 'max_depth': [7,]},
    # ]
    # method = GradientBoostingClassifier(random_state=1, verbose=1)
    # gscv = GridSearchCV(method, params, scoring='roc_auc', verbose=verbose, n_jobs=5)
    # gscv.fit(train_X.toarray(), train_y)
    # if verbose:
    #     for params, mean_score, all_scores in gscv.grid_scores_:
    #         logger.info('{:.6f} (+/- {:.6f}) for {}'.format(mean_score, all_scores.std() / 2, params))
    #     logger.info('params:{params}'.format(params=gscv.best_params_))
    #     logger.info('score:{params}'.format(params=gscv.best_score_))
    # pred = gscv.best_estimator_.predict_proba(test_X.toarray())

    # Best parameters for the competition data
    method = GradientBoostingClassifier(n_estimators=50000, learning_rate=2**(-9.5),
                                        max_features='log2', max_depth=7,
                                        random_state=1, verbose=1)
    method.fit(train_X.toarray(), train_y)
    pred = method.predict_proba(test_X.toarray())

    np.savetxt(args.output, pred[:, 1], fmt='%.6f')
    if verbose: logger.info('Wrote preds to {file}'.format(file=args.output))

    return 0
def test_gradient_boosting_early_stopping():
    X, y = make_classification(n_samples=1000, random_state=0)

    gbc = GradientBoostingClassifier(n_estimators=1000,
                                     n_iter_no_change=10,
                                     learning_rate=0.1, max_depth=3,
                                     random_state=42)

    gbr = GradientBoostingRegressor(n_estimators=1000, n_iter_no_change=10,
                                    learning_rate=0.1, max_depth=3,
                                    random_state=42)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    # Check if early_stopping works as expected
    for est, tol, early_stop_n_estimators in ((gbc, 1e-1, 24), (gbr, 1e-1, 13),
                                              (gbc, 1e-3, 36),
                                              (gbr, 1e-3, 28)):
        est.set_params(tol=tol)
        est.fit(X_train, y_train)
        assert_equal(est.n_estimators_, early_stop_n_estimators)
        assert est.score(X_test, y_test) > 0.7

    # Without early stopping
    gbc = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1,
                                     max_depth=3, random_state=42)
    gbc.fit(X, y)
    gbr = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1,
                                    max_depth=3, random_state=42)
    gbr.fit(X, y)

    assert gbc.n_estimators_ == 100
    assert gbr.n_estimators_ == 200
def model_train_ensemble(X1,Y1,Save = False, modelname = None):
    
    X1,Y1 = DowmSample(X1,Y1,9)
    
#     model = RandomForestClassifier(n_estimators=100,random_state=1)
    model = GradientBoostingClassifier(n_estimators=100,max_leaf_nodes=5, subsample=0.7, learning_rate=0.1, random_state=1)
#     model = LogisticRegression('l2')
    model.fit(X1, Y1.ravel())
    
    # Save the model
    if Save == True:
        f = open(modelname, 'wb')   # pickle requires a binary-mode file
        pickle.dump(model, f)
        f.close()

    print('\n -------------- Training is over ----------------------')
    return model
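A short sketch of loading a model saved by model_train_ensemble (the file name is a placeholder):

import pickle

with open('ensemble_model.pkl', 'rb') as f:   # hypothetical path passed as modelname
    model = pickle.load(f)
# proba = model.predict_proba(X_new)          # X_new: new samples to score
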





    

    
def test_partial_dependence_input():
    # Test input validation of partial dependence.
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    assert_raises(ValueError, partial_dependence,
                  clf, [0], grid=None, X=None)

    assert_raises(ValueError, partial_dependence,
                  clf, [0], grid=[0, 1], X=X)

    # first argument must be an instance of BaseGradientBoosting
    assert_raises(ValueError, partial_dependence,
                  {}, [0], X=X)

    # Gradient boosting estimator must be fit
    assert_raises(ValueError, partial_dependence,
                  GradientBoostingClassifier(), [0], X=X)

    assert_raises(ValueError, partial_dependence, clf, [-1], X=X)

    assert_raises(ValueError, partial_dependence, clf, [100], X=X)

    # wrong ndim for grid
    grid = np.random.rand(10, 2, 1)
    assert_raises(ValueError, partial_dependence, clf, [0], grid=grid)
Example #8
def transform_with_gbm_to_categorical(header, tr_x, tr_y, ts_x, n_est=100, learning_rate=0.1, max_depth=5):

    clf = GradientBoostingClassifier(n_estimators=n_est, learning_rate=learning_rate, max_depth=max_depth)
    clf = clf.fit(tr_x, tr_y)

    """ #Node count
    estimators = clf.estimators_
    for row in estimators:
        for e in row:
            print(e.tree_.node_count)"""
    leaf_indices = clf.apply(tr_x)
    leaf_indices = leaf_indices.reshape(leaf_indices.shape[0], -1)

    ts_leaf_indices = clf.apply(ts_x)
    ts_leaf_indices = ts_leaf_indices.reshape(ts_leaf_indices.shape[0], -1)

    enc = OneHotEncoder()
    enc.fit(np.append(leaf_indices, ts_leaf_indices, axis=0))

    tr_cat_features = enc.transform(leaf_indices).toarray()
    ts_cat_features = enc.transform(ts_leaf_indices).toarray()

    header = ["cat_" + str(i) for i in range(ts_cat_features.shape[1])]
    print("[gbm_cat] Features size: ", len(header))
    return header, tr_cat_features, ts_cat_features
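The one-hot leaf features produced above are typically fed to a linear model (the classic GBDT + LR scheme); a minimal sketch of that follow-up step, assuming the helper above and its numpy/OneHotEncoder imports are available (data here is synthetic):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
tr_x, ts_x, tr_y, ts_y = train_test_split(X, y, random_state=0)
header, tr_cat, ts_cat = transform_with_gbm_to_categorical([], tr_x, tr_y, ts_x)
lr = LogisticRegression(max_iter=1000)
lr.fit(tr_cat, tr_y)
print(lr.predict_proba(ts_cat)[:, 1][:5])
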
Example #9
 def train_GBDT(self):
     samples=self.trainset.values
     target=self.trainlabel.values
     classifier_GB=GradientBoostingClassifier(n_estimators=1000)
     classifier_GB.fit(samples,target)
     
     return classifier_GB
Example #10
def classify2(dis_data, numeric_data, t_label):
    fold = 5
    skf = StratifiedKFold(t_label, fold)
    roc_auc = 0  
    f1_score_value = 0

    clf1 = LogisticRegression()
    clf2 = GradientBoostingClassifier()
#    clf3 = tree.DecisionTreeClassifier(max_depth=500, max_leaf_nodes= 500, class_weight={1:12})
    clf3 = GradientBoostingClassifier()
    
    for train, test in skf:
        clf3 = clf3.fit(dis_data.iloc[train], t_label.iloc[train])
        
        # compute AUC using the probability of the positive class
        probas_ = clf3.predict_proba(dis_data.iloc[test])
        fpr, tpr, thresholds = roc_curve(t_label.iloc[test], probas_[:, 1])
        roc_auc += auc(fpr, tpr)
        
        #compute f1_score
        label_pred = clf3.predict(dis_data.iloc[test])
        
        f1_score_value += f1_score(t_label.iloc[test], label_pred, pos_label= 1)
        
    return roc_auc / fold, f1_score_value / fold     
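Note that classify2 uses the pre-0.18 StratifiedKFold API; with current scikit-learn the same cross-validation loop looks roughly like this (a self-contained sketch on synthetic data):

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=300, random_state=0)
X, y = pd.DataFrame(X), pd.Series(y)
aucs = []
for train_idx, test_idx in StratifiedKFold(n_splits=5).split(X, y):
    clf = GradientBoostingClassifier()
    clf.fit(X.iloc[train_idx], y.iloc[train_idx])
    probs = clf.predict_proba(X.iloc[test_idx])[:, 1]
    aucs.append(roc_auc_score(y.iloc[test_idx], probs))
print(np.mean(aucs))
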
def PlotFeaturesImportance(X,y,featureNames,dataName):
    '''
    Plot the relative contribution/importance of the features.
    Best to reduce to top X features first - for interpretability
    Code example from:
    http://bugra.github.io/work/notes/2014-11-22/an-introduction-to-supervised-learning-scikit-learn/
    '''
    gbc = GradientBoostingClassifier(n_estimators=40)
    gbc.fit(X, y)
    # Get Feature Importance from the classifier
    feature_importance = gbc.feature_importances_
    # Normalize The Features
    feature_importance = 100 * (feature_importance / feature_importance.max())
    sorted_idx = numpy.argsort(feature_importance)
    pos = numpy.arange(sorted_idx.shape[0]) + 4.5
    # pos = numpy.arange(sorted_idx.shape[0])
    # plt.figure(figsize=(16, 12))
    plt.figure(figsize=(14, 9), dpi=250)
    plt.barh(pos, feature_importance[sorted_idx], align='center', color='#7A68A6')
    #plt.yticks(pos, numpy.asanyarray(df.columns.tolist())[sorted_idx]) #ORIG
    plt.yticks(pos, numpy.asanyarray(featureNames)[sorted_idx])

    plt.xlabel('Relative Importance')
    plt.title('%s: Top Features' %(dataName))
    plt.grid(False)
    plt.ion()
    plt.show()
    plt.savefig(str(dataName)+'TopFeatures.png',dpi=200)
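A usage sketch for PlotFeaturesImportance on a public dataset (the breast-cancer data is only an example; the function assumes numpy and matplotlib.pyplot are already imported as numpy and plt):

from sklearn.datasets import load_breast_cancer

data = load_breast_cancer()
PlotFeaturesImportance(data.data, data.target, data.feature_names, 'BreastCancer')
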
Example #12
def final_run(X,Y,Xtest,n_est):
    clf = GradientBoostingClassifier(n_estimators=n_est,random_state=n_est)
    clf = clf.fit(X,Y)
    #np.savetxt('gb_oob_improve_{}'.format(n_est),clf.oob_score_)
    #np.savetxt('gb_train_score_{}'.format(n_est),clf.train_score_)
    Ytest=clf.predict(Xtest)
    output(Ytest,'gradient_boost_{}.csv'.format(n_est))
Example #13
def main():
	makeSub = True
	featureImportance = False
	cvfold = True
	df = pd.read_csv('../data/cprobTrain15NA.csv')

	X, y = np.array(pd.read_csv('../data/train.csv',usecols=range(1,9))), np.array(pd.read_csv('../data/train.csv').ACTION)
	X = np.hstack((X,np.array(df)))

	params = {'max_depth':4, 'subsample':0.5, 'verbose':0, 'random_state':1337,
		'min_samples_split':10, 'min_samples_leaf':10, 'max_features':10,
		'n_estimators': 350, 'learning_rate': 0.05}	

	clf = GradientBoostingClassifier(**params)
	prefix = 'lib/gbm350d4m10c15'
	if cvfold:
		c = classifier.Classifier(X,y)
		c.validate(clf,nFolds=10,out=prefix+'Train.csv')

	if makeSub:
		Xt = np.array(pd.read_csv('../data/test.csv',usecols=range(1,9)))
		Xt = np.hstack((Xt,np.array(pd.read_csv('../data/cprobTest15NA.csv'))))
		clf.fit(X,y)
		y_ = clf.predict_proba(Xt)[:,1]
		out = pd.read_csv('subs/nbBaseTest.csv')
		out.ACTION = y_
		out.to_csv(prefix+'Test.csv',index=False)

	if featureImportance:
		print("Feature ranking:")
		importances = clf.feature_importances_
		indices = np.argsort(importances)[::-1]
		np.savetxt('indices.txt',indices,delimiter=',')
		for f in range(df.shape[1]):
			print("%d. feature (%s,%f)" % (f + 1, df.columns[indices[f]], importances[indices[f]]))
def gradientboost_prediction(features_train, labels_train, features_test, ids):

    class RandomForestClassifier_compatibility(RandomForestClassifier):
        def predict(self, X):
            return self.predict_proba(X)[:, 1][:, np.newaxis]

    base_estimator = RandomForestClassifier_compatibility()

    clf = GradientBoostingClassifier(loss='deviance', learning_rate=0.1,
                             n_estimators=5, subsample=0.3,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_depth=3,
                             init=base_estimator,
                             random_state=None,
                             max_features=None,
                             verbose=2)

    clf = clf.fit(features_train, labels_train)

    pred = clf.predict_proba(features_test)[:,1]

    # feature_importance = clf.feature_importances_
    #
    # print (feature_importance)

    predictions_file = open("data/rf_prediction.csv", "w", newline='')
    predictions_file_object = csv.writer(predictions_file)
    predictions_file_object.writerow(["ID", "TARGET"])
    predictions_file_object.writerows(zip(ids, pred))
    predictions_file.close()
def train():
    posi_result = {}
    train_feature, test_feature, train_id_list, test_id_list, train_tar_list = merge_feature(feature_str)
    tmp1 = [m < 32 for m in train_tar_list]
    tmp1 = np.array(tmp1)
    # train_feature = train_feature[tmp1]
    target_list = np.array(train_tar_list)
    target_list = target_list[tmp1]
    # train_id_list = np.array(train_id_list)
    # train_id_list = train_id_list[tmp1]
    c_feature = train_feature.columns[:]
    clf1 = RandomForestClassifier(n_estimators=200, min_samples_split=17)
    clf1.fit(train_feature[c_feature], target_list)
    # rf_preds = clf1.predict(test_feature)
    rf_prob = clf1.predict_proba(test_feature)
    gbdt1 = GradientBoostingClassifier(n_estimators=150, min_samples_split=17)
    gbdt1.fit(train_feature[c_feature], target_list)
    # gbdt_preds = gbdt1.predict(test_feature)
    gbdt_prob = gbdt1.predict_proba(test_feature)
    all_prob = rf_prob + gbdt_prob
    all_preds = []
    print(all_prob.shape)
    for k in range(all_prob.shape[0]):
        prob1 = list(all_prob[k, :])
        ind1 = prob1.index(max(prob1))
        all_preds.append(ind1)
    for j in range(len(all_preds)):
        all_pre_name = dl.get_num_position(all_preds[j])
        posi_result[test_id_list[j]] = all_pre_name
    return posi_result
def gbc_gp_predict(train_x, train_y, test_x):
    feature_indexs = getTopFeatures(train_x, train_y)
    sub_x_Train = get_data(
        train_x,
        feature_indexs[:16],
        features.feature_pair_sub_list,
        features.feature_pair_plus_list,
        features.feature_pair_mul_list,
        features.feature_pair_divide_list[:20],
    )
    sub_x_Test = get_data(
        test_x,
        feature_indexs[:16],
        features.feature_pair_sub_list,
        features.feature_pair_plus_list,
        features.feature_pair_mul_list,
        features.feature_pair_divide_list[:20],
    )
    labels = toLabels(train_y)
    gbc = GradientBoostingClassifier(n_estimators=3000, max_depth=9)
    gbc.fit(sub_x_Train, labels)
    pred_probs = gbc.predict_proba(sub_x_Test)[:, 1]
    ind_test = np.where(pred_probs > 0.55)[0]
    gp_preds_part = gbc_gp_predict_part(sub_x_Train, train_y, sub_x_Test[ind_test])
    gp_preds = np.zeros(len(test_x))
    gp_preds[ind_test] = gp_preds_part
    return gp_preds
Example #17
def ada_boost():
    savefile = open('traindata.pkl', 'rb')
    (x_train, y_train, t1) = cPickle.load(savefile)
    savefile.close()
    savefile = open('testdata.pkl', 'rb')
    (x_test, t1, name1) = cPickle.load(savefile)
    savefile.close()
    
#    X_train, X_valid, y_train, y_valid = cross_validation.train_test_split(
#    X, y, test_size=0.1, random_state=42)
    
    x_train = np.asarray(x_train,dtype=np.float32)
    y_train = np.asarray(y_train, dtype='int32')-1   
    
    nest = 190
    lr = .1
    md = 6
#    clf1 = DecisionTreeClassifier(max_depth=2)
#    clf = AdaBoostClassifier(clf1, n_estimators=200, learning_rate=.25)
    clf = GradientBoostingClassifier(n_estimators=nest, learning_rate=lr, max_depth=md, random_state=0)
#    clf = RandomForestClassifier(n_estimators=200) #.81
#    clf = ExtraTreesClassifier(n_estimators=1000, max_depth=None, min_samples_split=10, random_state=0,n_jobs=8) #.81
#    clf = KNeighborsClassifier(15)
    if 1:
        clf.fit(x_train, y_train)
        ypred = clf.predict_proba(x_test)
        y_str = ['Class_1','Class_2','Class_3','Class_4','Class_5','Class_6','Class_7','Class_8','Class_9']
        kcsv.print_csv(ypred, name1, y_str,indexname='id')
        print (nest, lr, md) 
    
    if 0:
        multiclass_log_loss = make_scorer(score_func=logloss_mc, greater_is_better=True, needs_proba=True)
        scores = cross_val_score(clf, x_train, y_train, n_jobs=8, cv=5,scoring=multiclass_log_loss)
        print(scores)
        print (nest, lr, md, scores.mean())  
Example #18
def GradBoost(X_DS, Y_DS, X_train, X_test, y_train, y_test, Cl_Names = 'None', mask='None',Max_Depth=3):
#******************************************************************************

	from sklearn.ensemble import GradientBoostingClassifier as GBC #import library for machine learning analysis
	from sklearn.metrics import classification_report

	print('Gradient Boosting: Training...')  # notify the user about the status of the process

	Gradient_Boosting_obj = GBC(max_depth=Max_Depth) #call the Gradient Boosting routine built in
	Gradient_Boosting_obj.fit(X_train, y_train) #fit the gradient boosting model to the training set
	Pred_Train = Gradient_Boosting_obj.predict(X_train) #apply the gradient boosting model to the train dataset
	Pred_Test = Gradient_Boosting_obj.predict(X_test) #apply the gradient boosting model to the test dataset

	print('Gradient Boosting: Completed!')  # notify the user about the status of the process

	labels = len(np.unique(Y_DS)) #extract the labels from the classification classes
	Conf_M = np.zeros((labels,labels), dtype='int') #initialize the confusion matrix for the classification problem
	
	if Cl_Names != 'None':
		target_names = Cl_Names
	else:
		target_names = np.arange(len(np.unique(Y_DS))).astype(str).tolist()
	#end

	Conf_M = CM(y_test, Pred_Test,np.unique(Y_DS)) #calls the confusion matrix routine with the test set and prediction set

	print(classification_report(y_test, Pred_Test, target_names=target_names))  #print the performance indicators on the console

	return Gradient_Boosting_obj, Conf_M
 def get_n_fold_validation_score(self, fold=10):
     features = data.get_features()
     lables = data.get_lables()
     length = len(features)
     jump = length // fold
     index = 0
     k = 0
     scores = list()
     while k < fold:
         feature_test = features.iloc[index : (index + jump), :]
         lable_test = lables.iloc[index : (index + jump), :]
         feature_train_1, feature_train_2 = (
             features.iloc[0:index, :] if index != 0 else pd.DataFrame(),
             features.iloc[index + jump : length],
         )
         feature_train = pd.concat([feature_train_1, feature_train_2])
         lable_train_1, lable_train_2 = (
             lables.iloc[0:index, :] if index != 0 else pd.DataFrame(),
             lables.iloc[index + jump : length],
         )
         lable_train = pd.concat([lable_train_1, lable_train_2])
         index += jump
         k += 1
         classifier = GradientBoostingClassifier()
         classifier.fit(feature_train, lable_train["lable"].values)
         scores.append(accuracy_score(lable_test, classifier.predict(feature_test)))
     return sum(scores) / float(len(scores))
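scikit-learn's built-in cross-validation gives the same kind of estimate with far less bookkeeping; a minimal sketch on synthetic data:

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, random_state=0)
scores = cross_val_score(GradientBoostingClassifier(), X, y, cv=10, scoring='accuracy')
print(scores.mean())
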
Example #20
    def gbdt_train(self, data, task_id, window=DEFAULT_WINDOW):
        """
        Train a gbdt model.

        :param data: Training dataset.
        :param task_id: The id of the training task.
        :param window: the length of window
        """
        X_train = []
        y_train = []
        features = self.__calculate_features(data, window)
        if not features:
            return TSD_LACK_SAMPLE
        for index in features:
            X_train.append(index[0])
            y_train.append(index[1])
        X_train = np.array(X_train)
        y_train = np.array(y_train)
        try:
            grd = GradientBoostingClassifier(n_estimators=self.n_estimators, max_depth=self.max_depth, learning_rate=self.learning_rate)
            grd.fit(X_train, y_train)
            model_name = MODEL_PATH + task_id + "_model"
            joblib.dump(grd, model_name)
        except Exception as ex:
            return TSD_TRAIN_ERR, str(ex)
        return TSD_OP_SUCCESS, ""
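A short sketch of loading the persisted model back for scoring (the path is a placeholder following the MODEL_PATH + task_id convention used above):

import joblib

grd = joblib.load("model/task_123_model")   # hypothetical path
# y_prob = grd.predict_proba(X_window)      # X_window: features built as in __calculate_features
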
Example #21
def mse_sklearn(x_train, x_test, y_train, y_test, n_estimators):
    clf = GradientBoostingClassifier(n_estimators=n_estimators,
                                     min_samples_leaf=MIN_SAMPLES_LEAF,
                                     max_depth=MAX_DEPTH)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    # Note: despite the function name, this returns the F1 score rather than MSE.
    return f1_score(y_test, pred)
Example #22
class Blender(BaseEstimator, ClassifierMixin):
    def __init__(self, trained_clfs):
        self.clfs = trained_clfs
        # self.classifier = make_pipeline(OneHotEncoder(), DenseTransformer(),
        #                                 GradientBoostingClassifier())
        self.classifier = GradientBoostingClassifier()
        # self.classifier = make_pipeline(
        #     OneHotEncoder(), LogisticRegression(class_weight='auto'))

    def fit(self, data, target):
        # self.enc = LabelEncoder().fit(target)
        probs = self.transform_input(data)
        # self.classifier.fit(predictions, target)
        self.classifier.fit(probs, target)

    def predict(self, data):
        predictions = self.transform_input(data)
        return self.classifier.predict(predictions)

    def transform_input(self, data):
        probabilities = [clf.predict_proba(data) for clf in self.clfs]

        probabilities = np.array(probabilities)
        # features, samples = probabilities.shape
        n_clfs, samples, features = probabilities.shape
        probabilities = np.reshape(probabilities, (samples, n_clfs * features))
        probabilities[np.isnan(probabilities)] = 0
        return probabilities
def partial_dependence(df, y):
    '''
    INPUT:  df = feature DataFrame
            y = binary target variable with imbalanced classes
    OUTPUT: partial dependence plots for the six most important features

    Oversamples the minority class until each class makes up 50% of the data,
    engineers features, fits a gradient boosting classifier, and plots the
    partial dependence of its six most important features.
    '''
    X_train, X_test, y_train, y_test = oversample_train_test(df, y)
    # X_train, X_test, y_train, y_test = train_test_split(df, y, random_state=42)

    feature_engineering = Pipeline([
        ('lists', ListSplitter()),
        ('race', RaceDummies()),
        ('crime_sentence', CrimeAndSentence()),
        ('feat_eng', FeatureEngineer()),
        ('columns', ColumnFilter(prejudice=False))
    ])

    X = feature_engineering.fit_transform(X_train.copy(), y_train)
    X_test = feature_engineering.fit_transform(X_test.copy(), y_test)

    gbc = GradientBoostingClassifier(n_estimators=850, learning_rate=.75)
    gbc.fit(X.copy(), y_train)
    most_imp = np.argsort(gbc.feature_importances_)[-6:]

    names = list(X_test.columns)
    feats = list(most_imp)
    fig, axs = plot_partial_dependence(gbc, X_test, feats, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
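plot_partial_dependence from sklearn.ensemble.partial_dependence was removed in later scikit-learn releases; a minimal sketch of the replacement API in sklearn.inspection (synthetic data, illustrative feature indices):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import PartialDependenceDisplay

X, y = make_classification(n_samples=500, n_features=6, random_state=0)
gbc = GradientBoostingClassifier().fit(X, y)
PartialDependenceDisplay.from_estimator(gbc, X, features=[0, 1], grid_resolution=50)
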
def run_gradient_boosting_classifier(data, _max_depth):
    (feature_train, feature_test, label_train, label_test) = train_test_split(data[:, 0:-1], data[:, -1].astype(int),
                                                                              test_size=0.25)
    # TODO: Vary Number of Estimators and Learning Rate
    gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, max_depth=_max_depth, verbose = True)
    gbc.fit(feature_train, label_train)
    training_error = gbc.score(feature_train, label_train)
    #cross_validation_score = cross_val_score(gbc, feature_train, label_train, cv=10)
    testing_error = gbc.score(feature_test, label_test)

    print("Gradient Boosting Results for Max Depth:", _max_depth)
    print("Training Accuracy:", training_error)
    #print("10-fold Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (cross_validation_score.mean(), cross_validation_score.std() * 2))
    print("Testing Accuracy:", testing_error)

    feature_importance = gbc.feature_importances_
    stddev = np.std([tree[0].feature_importances_ for tree in gbc.estimators_], axis=0)
    indices = np.argsort(feature_importance)[::-1]

    # Print the feature ranking
    print("Feature ranking:")
    for f in range(len(feature_importance)):
        print("%d. feature %d (%f)" % (f + 1, indices[f], feature_importance[indices[f]]))

    plot_feature_importance(feature_importance, indices, stddev, "gradient-boosted-classifier-feature-importance-depth-" + str(_max_depth))
def main():
    print('[INFO, time: %s] Getting Data....' % (time.strftime('%H:%M:%S')))
    testing_file = open('test.p', 'rb')
    training_file = open('train.p', 'rb')

    train = pickle.load(training_file)
    test = pickle.load(testing_file)

    testing_file.close()
    training_file.close()
    
    trainX = train[:,:-1]
    trainy = train[:,-1]
    
    testX = test[:,:-1]
    testy = test[:,-1]

    print('[INFO, time: %s] Fitting %s ...' % (time.strftime('%H:%M:%S'), 'GradientBoostingClassifier(n_estimators=1000)'))
    clf = GradientBoostingClassifier(n_estimators=1000)
    clf.fit(trainX, trainy)

    print('[INFO, time: %s] Making Predictions...' % (time.strftime('%H:%M:%S')))
    prediction = clf.predict(testX)
    print('[RESULT, time: %s] accuracy = %f' % (time.strftime('%H:%M:%S'), accuracy_score(testy, prediction)))


    model_save_file = open('gradient_1000.p', 'wb')
    pickle.dump(clf, model_save_file)
    model_save_file.close()
    print('All done')
Example #26
def ctr_gbdt(model='sklearn-clicklog', from_cache=False, train_dataset_length=100000, test_dataset_length=100000):
    TRAIN_FILE, TEST_FILE = create_dataset(model, from_cache, train_dataset_length, test_dataset_length)

    prediction_model = GradientBoostingClassifier(
        loss='deviance',
        learning_rate=0.1,
        n_estimators=30,
        subsample=1.0,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.0,
        max_depth=5,
    )

    x_train, y_train = clean_data(TRAIN_FILE)
    x_test, y_test = clean_data(TEST_FILE)

    with Timer('fit model'):
        prediction_model.fit(x_train, y_train)

    with Timer('evaluate model'):
        y_prediction_train = prediction_model.predict_proba(x_train)
        y_prediction_test = prediction_model.predict_proba(x_test)

        loss_train = log_loss(y_train, y_prediction_train)
        loss_test = log_loss(y_test, y_prediction_test)

    print('loss_train: %s' % loss_train)
    print('loss_test: %s' % loss_test)
Example #27
def predict(fea, df, t, t9):
    Un = df.columns == 'Blank'
    for f in fea:
        '''        
        try:
            df[(f+'_y')] = df[(f+'_x')] - df[(f+'_y')]
            print(1)
        except:
            pass
        '''
        Un = Un | (df.columns == f)
        Un = Un | (df.columns == (f+'_x'))
        Un = Un | (df.columns == (f+'_y'))
    Un = Un & (df.columns != 'New_y')    
    clf = GradientBoostingClassifier()
    y = df[t].label
    X = df[t].loc[:, Un]
    X_train, X_test, y_train, y_test=train_test_split(X, y, test_size = 0.9, random_state = 1)
    clf.fit(X_train, y_train)
    re = 'Testing AUC: \t' + str(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
    print(re)
    re = 'September AUC: \t' + str(roc_auc_score(df[t9].label, clf.predict_proba(df[t9].loc[:, Un])[:, 1]))
    print(re)
    print(X.columns)
    print(clf.feature_importances_)
    return Un, clf
Example #28
 def trainModelComb4(self):
     ntrain = self.data_train.shape[0]
     self.xtra = 5
     est_prob = np.zeros([ntrain,self.xtra+1]) #for original data, essay and others, which would be fed to a second gb 
     
     self.mlmodel2 = [LogisticRegression() for i in range(self.xtra)]
     for i in range(self.xtra-1):  
         self.mlmodel2[i].fit(self.data_train,self.labels_train[:,i+1])
         set_result =  self.mlmodel2[i].predict_proba(self.data_train)
         est_prob[:,i] = set_result[:,1]
                 
     self.mlmodel2[self.xtra-1].fit(self.data_train_ess,self.labels_train[:,0])
     set_result2 = self.mlmodel2[self.xtra-1].predict_proba(self.data_train_ess)
     est_prob[:,self.xtra-1] = set_result2[:,1]
     
     #self.data_train = np.hstack((self.data_train,est_prob))
     
     #self.mlmodel = AdaBoostClassifier()
     self.mlmodel = GradientBoostingClassifier(learning_rate=0.2,subsample=0.4)
     #self.mlmodel = RandomForestClassifier(n_estimators = 200, n_jobs=3,verbose =1)    
     self.mlmodel.fit(self.data_train,self.labels_train[:,0])
     set_result3 = self.mlmodel.predict_proba(self.data_train)
     est_prob[:,self.xtra] = set_result3[:,1]
     
     #2nd layer GB
     self.mlmodel3 = GradientBoostingClassifier(learning_rate=0.1)
     self.mlmodel3.fit(est_prob,self.labels_train[:,0])
def train_classifiers(X_data, y_data):
    ############ Linear SVM: 0.908 #############
    clf_LSVM = svm.SVC(kernel = 'linear')
    clf_LSVM.fit(X_data, y_data)
    
    ############ MultinomialNB: 0.875 #############
    clf_MNB = MultinomialNB()
    clf_MNB.fit(X_data, y_data)
    
    ############ Random Forest: 0.910 #############
    clf_RF = RandomForestClassifier(n_estimators=200, criterion='entropy')
    clf_RF.fit(X_data, y_data)
    
    ############ Extra Tree: 0.915 ##################
    clf_ETC = ExtraTreesClassifier(n_estimators=500, max_depth=None, min_samples_split=2, random_state=0)
    clf_ETC.fit(X_data, y_data)
    
    ############ AdaBoost: 0.88 ##################
    clf_Ada = AdaBoostClassifier()
    clf_Ada.fit(X_data, y_data)
    
    ############ rbf SVM: 0.895 #############
    clf_rbf = svm.SVC(C=200, gamma=0.06, kernel='rbf')
    clf_rbf.fit(X_data, y_data)
    
    ############ GradientBoosting: 0.88 #############
    clf_GBC = GradientBoostingClassifier()
    clf_GBC.fit(X_data, y_data)
    
    return clf_LSVM, clf_MNB, clf_RF, clf_ETC, clf_Ada, clf_rbf, clf_GBC    
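One way to combine several of the classifiers above into a single model is soft voting; a minimal sketch (VotingClassifier refits its estimators, and the SVMs are left out because soft voting needs probability estimates — these choices are assumptions, not part of the original code):

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)

voter = VotingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=200, criterion='entropy')),
                ('ada', AdaBoostClassifier()),
                ('gbc', GradientBoostingClassifier())],
    voting='soft')
# voter.fit(X_data, y_data)
# voter.predict(X_new)
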
def test_oob_improvement():
    """Test if oob improvement has correct shape and regression test. """
    clf = GradientBoostingClassifier(n_estimators=100, random_state=1, subsample=0.5)
    clf.fit(X, y)
    assert clf.oob_improvement_.shape[0] == 100
    # hard-coded regression test - change if modification in OOB computation
    assert_array_almost_equal(clf.oob_improvement_[:5], np.array([0.19, 0.15, 0.12, -0.12, -0.11]), decimal=2)
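The oob_improvement_ values checked above are per-iteration deltas; their cumulative sum is the usual way to visualise out-of-bag progress. A minimal sketch on synthetic data:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=500, random_state=1)
clf = GradientBoostingClassifier(n_estimators=100, subsample=0.5, random_state=1).fit(X, y)
plt.plot(np.cumsum(clf.oob_improvement_))
plt.xlabel('iteration')
plt.ylabel('cumulative OOB improvement')
plt.show()
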
Example #31
                                 learning_rate=0.1,
                                 subsample=0.81,
                                 colsample_bytree=0.61,
                                 max_depth=3,
                                 random_state=0)

    if os.path.exists("model/slot_gbdt.model"):
        with open("model/slot_gbdt.model", "rb") as f:
            gbdt_model = pickle.load(f)
    else:
        print("Warning: GBDT model not found, default model used")
        exit()
        gbdt_model = GradientBoostingClassifier(learning_rate=0.1,
                                                random_state=0,
                                                n_estimators=30,
                                                min_samples_split=2,
                                                min_samples_leaf=8,
                                                max_features=0.79,
                                                subsample=0.78,
                                                max_depth=5)

    base_models = [rf_model, gbdt_model, xgb_model]
    # stacker = LogisticRegression(random_state=43)
    stacker = XGBClassifier(random_state=42)

    ensemble = Ensemble(n_folds=5, stacker=stacker, base_models=base_models)

    y_pre = ensemble.fit_predict(X=x_train, y=y_train, T=x_test)

    print(y_test)
    print(y_pre)
    recall = recall_score(y_test, y_pre)
Example #32
def do_gbdt4(train_x,
             train_y,
             test_x=None,
             test_y=None,
             learning_rate=0.03,
             max_depth=8,
             max_features=25,
             n_estimators=600,
             load=False,
             save=True,
             outfile=None,
             search=False,
             log=False):
    if search == False:
        if log == True:
            mdl_name = 'gbdt_log_train_lr' + str(learning_rate) + '_n' + str(
                n_estimators) + '_maxdep' + str(max_depth) + '.pkl'
        else:
            mdl_name = 'gbdt_train_lr' + str(learning_rate) + '_n' + str(
                n_estimators) + '_maxdep' + str(max_depth) + '.pkl'
        if os.path.exists(mdl_name) == True:
            clf_gbdt = joblib.load(mdl_name)
        else:
            # create gradient boosting
            clf_gbdt = GradientBoostingClassifier(learning_rate=learning_rate,
                                                  max_depth=max_depth,
                                                  max_features=max_features,
                                                  n_estimators=n_estimators)
            #n_estimators=500, learning_rate=0.5, max_depth=3)
            clf_gbdt.fit(train_x, train_y)
            if save == True:
                try:
                    _ = joblib.dump(clf_gbdt, mdl_name, compress=1)
                except:
                    print("*** Save GBM model to pickle failed!!!")
                    if outfile != None:
                        outfile.write("*** Save RF model to pickle failed!!!")
        if test_x is not None and test_y is not None:
            probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
            score_gbdt = roc_auc_score(test_y, probas_gbdt)
            print("GBDT ROC score", score_gbdt)
        return clf_gbdt
    else:
        max_depth_list = [6, 7, 8, 9, 10]
        n_list = [2000]
        lr_list = [0.005, 0.003]
        max_feat_list = [15, 16, 17, 18, 20]
        info = {}
        for md in max_depth_list:
            for n in n_list:
                for lr in lr_list:
                    for mf in max_feat_list:
                        print('max_depth =', md)
                        print('n =', n)
                        print('learning rate =', lr)
                        print('max feature =', mf)
                        # n_estimators=500, learning_rate=0.5, max_depth=3)
                        mdl_name = 'gbdt_n' + str(n) + '_lr' + str(
                            lr) + '_md' + str(md) + 'mf' + str(mf) + '.pkl'
                        if os.path.exists(mdl_name) == True:
                            clf_gbdt = joblib.load(mdl_name)
                        else:
                            clf_gbdt = GradientBoostingClassifier(
                                learning_rate=lr,
                                max_depth=md,
                                max_features=mf,
                                n_estimators=n)
                            clf_gbdt.fit(train_x, train_y)
                            _ = joblib.dump(clf_gbdt, mdl_name, compress=1)
                        probas_gbdt = clf_gbdt.predict_proba(test_x)[:, 1]
                        score_gbdt = roc_auc_score(test_y, probas_gbdt)
                        info[md, n, lr, mf] = score_gbdt
        for key in info:
            score = info[key]
            print(
                'GBDT max_depth = %d, n = %d, lr = %.5f, max_feature = %d, ROC score = %.5f'
                % (key[0], key[1], key[2], key[3], score))
Example #33
df3 = df3.drop("Address", axis=1)
df3 = df3.drop("Dates", axis=1)
df3 = df3.drop('id', axis=1)
"""
scaler = MinMaxScaler()
numerical = ['X','Y','Time','Date']
df1[numerical] = scaler.fit_transform(df1[numerical])
df3[numerical] = scaler.transform(df3[numerical])
"""
X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = model_selection.train_test_split(
    df1, train_target, random_state=0)

clf = GradientBoostingClassifier(n_estimators=100,
                                 learning_rate=0.1,
                                 min_samples_leaf=20,
                                 max_depth=5,
                                 random_state=0)
clf.fit(X_train_sub, y_train_sub)
print("Accuracy score (training): {0:.3f}".format(
    clf.score(X_train_sub, y_train_sub)))
print("Accuracy score (validation): {0:.3f}".format(
    clf.score(X_validation_sub, y_validation_sub)))
print()
"""

clf = GradientBoostingClassifier(n_estimators=100, learning_rate = 0.5, min_samples_leaf=10, max_depth = 5, random_state = 0)

X_train_sub, X_validation_sub, y_train_sub, y_validation_sub = model_selection.train_test_split(df1, train_target, random_state=0)

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
Example #34
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Parameter evaluation with GridSearchCV
gbe = GradientBoostingClassifier(random_state=42)
parameters = {
    'learning_rate': [0.05, 0.1, 0.5],
    'max_features': [0.5, 1],
    'max_depth': [3, 4, 5]
}
gridsearch = GridSearchCV(gbe, parameters, cv=100, scoring='roc_auc')
gridsearch.fit(X, y)
print(gridsearch.best_params_)
print(gridsearch.best_score_)

# Adjusting the decision threshold
gbi = GradientBoostingClassifier(learning_rate=0.05,
                                 max_depth=3,
                                 max_features=0.5,
                                 random_state=42)
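The snippet stops after defining gbi; adjusting the decision threshold would continue roughly like this (the 0.3 cutoff is only an example):

gbi.fit(X_train, y_train)
probs = gbi.predict_proba(X_test)[:, 1]
y_pred = (probs >= 0.3).astype(int)   # custom cutoff instead of the default 0.5
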
Example #35
def trainGBM(train, target, test, test_stripped):
    gbm = GradientBoostingClassifier()
    gbm.fit(train, target)
    prediction = [[test[index][0], x] for index, x in enumerate(gbm.predict(test_stripped))]
    return prediction
Example #36
y_predXgb = xgbClassifier.predict(train)
y_predXgbt = xgbClassifier.predict(test)

cm = confusion_matrix(y_test, y_predXgb)

from sklearn.neighbors import KNeighborsClassifier
KNclassifier = KNeighborsClassifier(n_neighbors=20, metric='minkowski', p=2)
KNclassifier.fit(train, y)
y_predKN = KNclassifier.predict(train)
y_predKNt = KNclassifier.predict(test)

cm = confusion_matrix(y_test, y_predKN)

from sklearn.ensemble import GradientBoostingClassifier
gdr = GradientBoostingClassifier(n_estimators=1000,
                                 random_state=42,
                                 learning_rate=0.02,
                                 max_depth=2)
gdr.fit(train, y)
y_predgrd = gdr.predict(train)
y_predgrdt = gdr.predict(test)

cm = confusion_matrix(y_test, y_predgrd)

stacked_prediction1 = np.column_stack((y_pred, y_predXgb, y_predKN, y_predgrd))
stacked_predictionTest1 = np.column_stack(
    (y_predt, y_predXgbt, y_predKNt, y_predgrdt))

xgb_metal_Classifier = XGBClassifier(n_estimators=10,
                                     learning_rate=0.1,
                                     random_state=42)
xgb_metal_Classifier.fit(stacked_prediction1, y)
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.5640644334722438
exported_pipeline = GradientBoostingClassifier(learning_rate=0.01, max_depth=3, max_features=0.6000000000000001, min_samples_leaf=20, min_samples_split=18, n_estimators=100, subsample=0.4)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
Example #38
y = titanic["Survived"].values

# Split data in a train and a validation set
X_train, X_val, y_train, y_val = train_test_split(X,
                                                  y,
                                                  test_size=.2,
                                                  random_state=42)

# Initialize the classifiers I wish to try
clfs = []
clfs.append(LogisticRegression())
clfs.append(SVC())
clfs.append(KNeighborsClassifier(n_neighbors=3))
clfs.append(DecisionTreeClassifier())
clfs.append(RandomForestClassifier())
clfs.append(GradientBoostingClassifier())

mean_clfs = []
std_clfs = []
validation_score = []

# Loop over the classifiers. For each classifier we compute the cross-validation accuracy score.
# We save the accuracy on the validation set as well
for name, classifier in zip(clfs_name, clfs):
    scores = cross_val_score(classifier,
                             X_train,
                             y_train,
                             cv=7,
                             scoring="accuracy")
    print('---------------------------------')
    print(name, ':')
def get_accuracies(data):

    X_train, X_test, y_train, y_test = get_balanced_data(data)

    seed = 1
    rfc = RandomForestClassifier(bootstrap=True,
                                 max_depth=10,
                                 max_features='auto',
                                 min_samples_leaf=2,
                                 min_samples_split=10,
                                 n_estimators=500)

    rfc2 = RandomForestClassifier(bootstrap=False,
                                  max_depth=2,
                                  max_features='auto',
                                  min_samples_leaf=5,
                                  min_samples_split=20,
                                  n_estimators=100)

    gbm = GradientBoostingClassifier(min_samples_split=25,
                                     min_samples_leaf=25,
                                     loss='deviance',
                                     learning_rate=0.1,
                                     max_depth=5,
                                     max_features='auto',
                                     criterion='friedman_mse',
                                     n_estimators=100)

    def baseline_model(optimizer='adam', learn_rate=0.01):
        model = Sequential()
        model.add(Dense(100, input_dim=X_train.shape[1], activation='relu'))
        model.add(
            Dense(50, activation='relu')
        )  # 8 is the dim/ the number of hidden units (units are the kernel)
        model.add(Dense(2, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])
        return model

    keras = KerasClassifier(build_fn=baseline_model,
                            batch_size=32,
                            epochs=100,
                            verbose=0,
                            optimizer='Adam')

    outer_cv = KFold(n_splits=5, shuffle=True, random_state=seed)

    svm = SVC(gamma="scale", probability=True, kernel='rbf', C=0.5)

    models = [('GBM', gbm), ('RFC', rfc), ('RFC2', rfc2), ('Keras', keras),
              ('SVM', svm)]

    results = []
    names = []
    scoring = 'accuracy'

    accuracy = []
    for name, model in models:
        cv_results = cross_val_score(model,
                                     X_train,
                                     y_train,
                                     cv=outer_cv,
                                     scoring=scoring)
        results.append(cv_results)
        names.append(name)
        # msg = "Cross-validation Accuracy %s: %f (+/- %f )" % (name, cv_results.mean() * 100, cv_results.std() * 100)
        # print(msg)
        model.fit(X_train, y_train)
        # print('Test set accuracy: {:.2f}'.format(model.score(X_test, y_test) * 100), '%')
        # accuracy.append(name)
        accuracy.append(model.score(X_test, y_test))
    return accuracy
# Make predictions using Extra Trees Classifier + 0.5 subset as it gave the best estimated performance

max_depth = 11

#Obtain the list of indexes for the required model
indexes = []
for trans,name,X,X_val,v,cols_list,rem_list,rank_list,i_cols_list,i_rem_list in X_all_add:
    if v == 0.5:
        if trans == 'Orig':
            indexes = i_cols_list
            break

from sklearn.ensemble import GradientBoostingClassifier

#Best model definition
best_model = GradientBoostingClassifier(max_depth=max_depth, random_state=seed)
best_model.fit(X_orig[:,indexes],Y)

#Read test dataset
dataset_test = pandas.read_csv("../input/test.csv")
#Drop unnecessary columns
ID = dataset_test['Id']
dataset_test.drop('Id',axis=1,inplace=True)
dataset_test.drop(rem,axis=1,inplace=True)
X_test = dataset_test.values

#Make predictions using the best model
predictions = best_model.predict(X_test[:,indexes])
# Write submissions to output file in the correct format
with open("submission.csv", "w") as subfile:
    subfile.write("Id,Cover_Type\n")
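    # The loop writing the prediction rows is truncated above; a plausible
    # completion, assuming integer Ids and integer Cover_Type predictions:
    for row_id, pred in zip(ID, predictions):
        subfile.write("%d,%d\n" % (int(row_id), int(pred)))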
Example #41
#                        ('cntr_brand', Pipeline([
#                            ('group_col_selector', ColumnSelector(['phone_brand'])),
#                            ('print_data2', PrintTransfrmer()),
#                            ('cnt', GroupCntTransfrmer()),
#                            ])),
#                    ],
#                    transformer_weights={
##                         weight components in FeatureUnion
#                        'cntr_device': 1.0,
#                        'cntr_brand': 1.0,
#                    },
#                    )
#                ),
                ('print_data', PrintTransfrmer()),
                ('estimators', FeatureUnion([
                    ('gbc', GradientBoostingClassifier()),
                    ('rf', RandomForestClassifier()),
                    ])
                ),
                ('ensambler', LogisticRegression()),
                ])

pipe_params = {#'feature_union__transformer_weights':[[1,1], [4,1], [1,4]],
               'estimators__gbc__n_estimators': [100, 500, 1500],
               'estimators__rf__n_estimators': [100, 500, 1500],
               'ensambler__C': [10, 1, 0.1],
                }

if __name__=='__main__':
    all_data = dataset()
    all_data.data_wrangling(code_testing=False)
Example #42
                               min_weight_fraction_leaf=0.0,
                               presort=False,
                               random_state=None,
                               splitter='best')

modle.fit(titanic[predictors], titanic["Survived"])
kf = cross_validation.KFold(titanic.shape[0], 3, random_state=1)
scores = cross_validation.cross_val_score(modle,
                                          titanic[predictors],
                                          titanic["Survived"],
                                          cv=kf)
print("DT estimated accuracy: %f" % scores.mean())

#GBDT
gb_clf = GradientBoostingClassifier(n_estimators=50,
                                    max_depth=3,
                                    random_state=1)
gb_clf.fit(titanic[predictors], titanic["Survived"])
kf = cross_validation.KFold(titanic.shape[0], 3, random_state=1)
scores = cross_validation.cross_val_score(gb_clf,
                                          titanic[predictors],
                                          titanic["Survived"],
                                          cv=kf)
print("GBDT estimated accuracy: %f" % scores.mean())

# KNN
knn = neighbors.KNeighborsClassifier(algorithm='kd_tree', n_neighbors=10)
knn.fit(titanic[predictors], titanic["Survived"])
kf = cross_validation.KFold(titanic.shape[0], 5, random_state=1)
scores = cross_validation.cross_val_score(knn,
                                          titanic[predictors],
Example #43
def modell(X_org, y_org, test_x):
    n_folds = 5
    verbose = True
    shuffle = False

    X = X_org
    y = y_org
    X_submission = test_x
    #X_submission = X_org

    if shuffle:
        idx = np.random.permutation(y.size)
        X = X[idx]
        y = y[idx]
    skf = StratifiedKFold(n_splits=5, random_state=20, shuffle=True)
    a=list(skf.split(X, y))
    
#    skf = StratifiedKFold(y=y, n_folds=n_folds)

    clfs = [
        #RandomForestClassifier().set_params(**INITIAL_PARAMS.get("RFC:one", {})),
        ExtraTreesClassifier().set_params(**INITIAL_PARAMS.get("ETC:one", {})),
        GradientBoostingClassifier().set_params(**INITIAL_PARAMS.get("GBC:one", {})),
        #LogisticRegression().set_params(**INITIAL_PARAMS.get("LR:one", {})),
        xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:two", {})),
        #xgb.XGBClassifier().set_params(**INITIAL_PARAMS.get("XGBC:one", {})),
        lgb.LGBMClassifier().set_params(**INITIAL_PARAMS.get("LGB:one", {}))
        ]

    print ("Creating train and test sets for blending.")

    dataset_blend_train = np.zeros((X.shape[0], len(clfs)))
    dataset_blend_test = np.zeros((X_submission.shape[0], len(clfs)))

    for j, clf in enumerate(clfs):
        print (j, clf)
        dataset_blend_test_j = np.zeros((X_submission.shape[0], len(a)))
        for i, (train, test) in enumerate(a):
            print ("Fold", i)
            X_train = X[train]
            y_train = y[train]
            X_test = X[test]
            y_test = y[test]
            clf.fit(X_train, y_train)
            y_submission = clf.predict_proba(X_test)[:,1]
            dataset_blend_train[test, j] = y_submission
            dataset_blend_test_j[:, i] = clf.predict_proba(X_submission)[:,1]
        dataset_blend_test[:,j] = dataset_blend_test_j.mean(1)

    print ("Blending.")
    clf = LogisticRegression(C=2, penalty='l2', class_weight='balanced', n_jobs=-1)
#    clf = linear_model.RidgeCV(
#            alphas=np.linspace(0, 200), cv=LM_CV_NUM)
    #clf = GradientBoostingClassifier(learning_rate=0.02, subsample=0.5, max_depth=6, n_estimators=100)
    clf.fit(dataset_blend_train, y)
    y_submission = clf.predict(dataset_blend_test)
    x_submission = clf.predict(dataset_blend_train)
#    final_model = LinearRegression()
#final_model.fit(stacked_train, y_train)
#test_prediction = final_model.predict(stacked_test)

    print ("Linear stretch of predictions to [0,1]")
       
    print ("blend result")
    #save_submission.to_csv(r'C:\Users\Administrator\Desktop\da\su.csv', index=False)
    return y_submission,dataset_blend_train,dataset_blend_test,x_submission
#https://www.kaggle.com/rblcoder/learning-bayes-search-optimization
#https://scikit-optimize.github.io/#skopt.BayesSearchCV
# uses baysian optimization to find model parameters

from skopt import BayesSearchCV
import pandas as pd
from skopt.space import Real, Categorical, Integer

#estimator = GradientBoostingClassifier(n_estimators=100,
#                                   max_depth=6,
#                                   min_samples_split=2,
#                                   min_samples_leaf=0.001,
#                                   subsample=0.5,
#                                   learning_rate=0.001)

estimator = GradientBoostingClassifier()

search_spaces = {
    'n_estimators': Integer(100, 2000),
    'max_depth': Integer(6, 15),
    'min_samples_split': Integer(2, 20),
    'min_samples_leaf': Integer(1, 20),
    'subsample': Real(0.5, 1),
    'learning_rate': Real(0.001, 0.2)
}

opt = BayesSearchCV(estimator,
                    search_spaces,
                    n_iter=20,
                    scoring='roc_auc',
                    n_jobs=-1,
Example #45
clf = ExtraTreesClassifier()
clf.fit(x_train, y_train)

print("ExtraTrees classifier")
print(clf.score(x_test, y_test))
print("\n")

#GradientBoosting Classifier

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=42)

clf = GradientBoostingClassifier()
clf.fit(x_train, y_train)

print("GradientBoostingClassifier")
print(clf.score(x_test, y_test))
print("\n")

#Trying other classifier

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=42)

bagging = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5,
Example #46
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.kernel_approximation import RBFSampler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.9822134387351777
exported_pipeline = make_pipeline(
    RBFSampler(gamma=0.30000000000000004),
    GradientBoostingClassifier(learning_rate=0.01,
                               max_depth=10,
                               max_features=0.7000000000000001,
                               min_samples_leaf=11,
                               min_samples_split=11,
                               n_estimators=100,
                               subsample=0.3))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Same for test
x_test, y_test = test_df[['Message', 'Status']].values.T
y_test = y_test.astype('int')
x_test = x_test.astype('str')

print(x_test.shape)
vect = CountVectorizer(min_df=2, ngram_range=(2,2))
X_train = vect.fit(x_train).transform(x_train)
print(X_train[1].toarray())
X_test = vect.transform(x_test)

print('Len of vocabulary is {0}'.format(len(vect.vocabulary_)))
print(len(vect.get_feature_names_out()))

param_grid = {'n_estimators':[200,100, 50],'max_depth':[5,6,7,8],'min_samples_leaf':[10,50,100],'max_features':['sqrt','log2']}
grid = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5)
grid.fit(X_train, y_train)
clf = grid.best_estimator_
print(grid.best_params_)
clf.fit(X_train,y_train)

# For test set
y_test_pred = clf.predict(X_test)
recall = recall_score(y_true=(y_test), y_pred=y_test_pred)
precision = precision_score(y_true=(y_test), y_pred=y_test_pred)
print('Recall of test set: {0}'.format(recall))
print('Precision of test set :{0}'.format(precision))

# For training set
y_train_pred = clf.predict(X_train)
recall = recall_score(y_true=(y_train), y_pred=y_train_pred)
def create_and_save_model(X, y):

    clf1 = XGBClassifier(learning_rate=0.1,
                         n_estimators=1000,
                         max_depth=5,
                         min_child_weight=1,
                         gamma=0,
                         subsample=0.8,
                         colsample_bytree=0.8,
                         objective='binary:logistic',
                         nthread=4,
                         scale_pos_weight=0.2,
                         seed=27,
                         reg_alpha=0.4,
                         reg_lambda=1,
                         early_stopping_rounds=50,
                         show_progress=True)

    clf2 = AdaBoostClassifier(n_estimators=150)

    # initialize the base classifier
    base_cls = DecisionTreeClassifier()

    # no. of base classifier
    num_trees = 200

    # bagging classifier
    clf3 = BaggingClassifier(base_estimator=base_cls,
                             n_estimators=num_trees,
                             random_state=8,
                             n_jobs=-1)

    clf4 = RandomForestClassifier(bootstrap=True,
                                  class_weight={
                                      0: 2.5,
                                      1: 1
                                  },
                                  criterion='entropy',
                                  max_depth=60,
                                  max_features="auto",
                                  max_leaf_nodes=50,
                                  min_impurity_decrease=0.0,
                                  min_impurity_split=None,
                                  min_samples_leaf=5,
                                  min_samples_split=6,
                                  min_weight_fraction_leaf=0.0,
                                  n_estimators=200,
                                  n_jobs=-1,
                                  oob_score=True,
                                  random_state=10,
                                  verbose=1,
                                  warm_start=False)

    params = {
        'n_estimators': 200,
        'max_depth': 20,
        'subsample': 0.6,
        'learning_rate': 0.01,
        'min_samples_leaf': 1,
        'random_state': 3,
        'loss': 'exponential',
        'max_features': 'auto',
        'verbose': 1
    }  #'ccp_alpha': 0.04
    clf5 = GradientBoostingClassifier(**params)

    estimators = [('xgb', clf1), ('abc', clf2), ('bc', clf3), ('rf', clf4),
                  ('gbc', clf5)]

    stack_estimator = XGBClassifier(learning_rate=0.1,
                                    n_estimators=300,
                                    max_depth=5,
                                    min_child_weight=1,
                                    gamma=0,
                                    subsample=0.8,
                                    colsample_bytree=0.8,
                                    objective='binary:logistic',
                                    nthread=40,
                                    scale_pos_weight=1,
                                    seed=27,
                                    reg_alpha=0,
                                    reg_lambda=1,
                                    early_stopping_rounds=50,
                                    show_progress=True)

    model = StackingClassifier(estimators=estimators,
                               final_estimator=stack_estimator,
                               n_jobs=-1,
                               cv=5,
                               verbose=1)

    model.fit(X, y)

    file_name = 'model_final.pkl'
    joblib.dump(model, file_name)

    return file_name
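
# Hypothetical usage sketch (not part of the original): reload the stacked model
# that create_and_save_model persists, e.g. for scoring a hold-out set later on.
import joblib

def load_saved_model(path='model_final.pkl'):
    """Reload the StackingClassifier written by create_and_save_model."""
    return joblib.load(path)

# e.g. model = load_saved_model(create_and_save_model(X, y))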
def get_classifier(method,
                   mode,
                   max_features=None,
                   n_estimators=None,
                   learning_rate=None,
                   random_state=None,
                   min_cases_for_training=30,
                   max_depth=None,
                   subsample=None,
                   colsample_bytree=None):

    if method == "xgb" and mode == "regr":
        return ClassifierWrapper(cls=XGBRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            subsample=subsample,
            max_depth=max_depth,
            colsample_bytree=colsample_bytree,
            n_jobs=-1,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    elif method == "xgb" and mode == "class":
        return ClassifierWrapper(cls=XGBClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            subsample=subsample,
            max_depth=max_depth,
            colsample_bytree=colsample_bytree,
            n_jobs=-1,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    elif method == "rf" and mode == "regr":
        return ClassifierWrapper(cls=RandomForestRegressor(
            n_estimators=n_estimators,
            max_features=max_features,
            n_jobs=-1,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    elif method == "rf" and mode == "class":
        return ClassifierWrapper(cls=RandomForestClassifier(
            n_estimators=n_estimators,
            max_features=max_features,
            n_jobs=-1,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    elif method == "gbm" and mode == "regr":
        return ClassifierWrapper(cls=GradientBoostingRegressor(
            n_estimators=n_estimators,
            max_features=max_features,
            learning_rate=learning_rate,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    elif method == "gbm" and mode == "class":
        return ClassifierWrapper(cls=GradientBoostingClassifier(
            n_estimators=n_estimators,
            max_features=max_features,
            learning_rate=learning_rate,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    elif method == "dt" and mode == "regr":
        return ClassifierWrapper(cls=DecisionTreeRegressor(
            max_depth=max_depth,
            max_features=max_features,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    elif method == "dt" and mode == "class":
        return ClassifierWrapper(cls=DecisionTreeClassifier(
            max_depth=max_depth,
            max_features=max_features,
            random_state=random_state),
                                 min_cases_for_training=min_cases_for_training,
                                 mode=mode)

    else:
        print("Invalid classifier type")
        return None
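
# Hypothetical call to the factory above (argument values are assumptions; the
# returned object is whatever ClassifierWrapper exposes in the original project):
gbm_classifier = get_classifier(method="gbm",
                                mode="class",
                                n_estimators=200,
                                max_features="sqrt",
                                learning_rate=0.05,
                                random_state=22)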
Example #50
}
"""          

paramDist = {'n_estimators': sp_randint(50,100),
#             'criterion': ['gini'],
             'max_features':['auto'],
             'max_depth': scipy.stats.expon(scale=5),
#             'min_samples_split':scipy.stats.expon(scale=2),
             'min_samples_leaf':scipy.stats.expon(scale=1)}
"""
paramReg = {'penalty': ['l2'], 'C': [0.1, 0.01, 0.001, 1]}

paramSVC = {'kernel': ['rbf'], 'C': [0.1, 0.01, 0.001, 1]}

Rforest = RandomForestClassifier(class_weight='balanced_subsample')
Gradboost = GradientBoostingClassifier()
LogReg = LogisticRegression(class_weight='balanced')
SVMCl = SVC(class_weight='balanced')
metric = roc_auc_score

grid_search = GridSearchCV(SVMCl,
                           cv=3,
                           param_grid=paramSVC,
                           n_jobs=4,
                           pre_dispatch='1*n_jobs',
                           scoring='precision')
grid_search = GridSearchCV(LogReg,
                           cv=3,
                           param_grid=paramReg,
                           n_jobs=4,
                           pre_dispatch='1*n_jobs',
Example #51
# Data
X = pd.read_csv("X_feat_sel.csv")
y = pd.read_csv("y.csv", header=None, names=['y'])


#########################################################
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier


#########################################################
# tune tree
gbX = GradientBoostingClassifier()

tree_grid = {'max_depth':np.arange(1,12,1), 
             'min_samples_split':[2,10,25,50,100,150,250],
             'min_samples_leaf':[1,5,10,25,50,100,150,250],
             'max_leaf_nodes':[None, 5, 10, 20, 40, 80],
             'max_features':np.arange(2,10),
             'learning_rate': 10 ** np.arange(-2, 1, dtype=float),
             'n_estimators':[10,25,50,100,200]}
              
gsX = RandomizedSearchCV(gbX, tree_grid, cv=2, n_iter=200)
gsX.fit(X, y.values.ravel())  # fit the randomized search before inspecting its results

gsX.cv_results_, gsX.best_params_, gsX.best_score_
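
# Hypothetical follow-up (not in the original excerpt): refit a single model with
# the best parameters found by the randomized search and report its mean
# cross-validated accuracy.
from sklearn.model_selection import cross_val_score

best_gb = GradientBoostingClassifier(**gsX.best_params_)
print(cross_val_score(best_gb, X, y.values.ravel(), cv=5).mean())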


#########################################################
RF_Label = RF.predict(data_all).reshape(width,
                                        height).astype(int).transpose(1, 0)
RF_predict_prob = RF.predict_proba(data_all)
# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(RF_predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
print('(Random Forest) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
      % (RF.score(X_train,y_train),RF.score(X_test,y_test),\
         seg_accuracy, (time.time()-start_time)))
# draw classification map
draw(GT_Label, RF_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
start_time = time.time()
GBC = GradientBoostingClassifier(n_estimators=300,
                                 learning_rate=0.1).fit(X_train, y_train)
GBC_Label = GBC.predict(data_all).reshape(width,
                                          height).astype(int).transpose(1, 0)
GBC_predict_prob = GBC.predict_proba(data_all)
# Post-processing using Graph-Cut
Seg_Label, seg_accuracy = Post_Processing(GBC_predict_prob,height,width,\
                                          num_classes,y_test,test_indexes)
print('(Gradient Boosting) Train_Acc=%.3f, Cla_Acc=%.3f, Seg_Acc=%.3f(Time_cost=%.3f)'\
      % (GBC.score(X_train,y_train),GBC.score(X_test,y_test),\
         seg_accuracy, (time.time()-start_time)))
# draw classification map
draw(GT_Label, GBC_Label, Seg_Label, train_map, test_map)
print('--------------------------------------------------------------------')

# Neural Network - MLP
from sklearn.neural_network import MLPClassifier
Example #53
    
    ## TODO: Add any additional arguments that you will need to pass into your model
    
    # args holds all passed-in arguments
    args = parser.parse_args()

    # Read in csv training file
    training_dir = args.data_dir
    train_data = pd.read_csv(os.path.join(training_dir, "train.csv"), header=None, names=None)

    # Labels are in the first column
    train_y = train_data.iloc[:,0]
    train_X = train_data.iloc[:,1:]
    
    
    ## --- Your code here --- ##
    

    ## Define a model 
    model = GradientBoostingClassifier()
    
    
    ## Train the model
    model.fit(train_X, train_y)
    
    
    ## --- End of your code  --- ##
    

    # Save the trained model
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
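
# A minimal, hypothetical model_fn to pair with this training script. SageMaker's
# scikit-learn serving container looks for a function with this name and signature
# to deserialize the model at inference time; it is not part of the original excerpt.
def model_fn(model_dir):
    """Reload the GradientBoostingClassifier saved by the training script."""
    return joblib.load(os.path.join(model_dir, "model.joblib"))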
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.30,
                                                    random_state=state)

# Passing different learning rates to find the best learning_rate.

from sklearn.ensemble import GradientBoostingClassifier

lr_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for learning_rate in lr_list:
    gb_clf = GradientBoostingClassifier(n_estimators=10,
                                        learning_rate=learning_rate,
                                        max_features=2,
                                        max_depth=2,
                                        random_state=0)
    gb_clf.fit(X_train, y_train)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
        gb_clf.score(X_train, y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
        gb_clf.score(X_test, y_test)))
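
# A small, hypothetical variant of the loop above that records the validation
# scores and keeps the best learning rate instead of only printing them:
val_scores = {}
for learning_rate in lr_list:
    clf = GradientBoostingClassifier(n_estimators=10,
                                     learning_rate=learning_rate,
                                     max_features=2,
                                     max_depth=2,
                                     random_state=0)
    clf.fit(X_train, y_train)
    val_scores[learning_rate] = clf.score(X_test, y_test)
best_lr = max(val_scores, key=val_scores.get)
print("Best learning rate on the validation split:", best_lr)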

from sklearn.metrics import classification_report, confusion_matrix

gb_clf2 = GradientBoostingClassifier(n_estimators=10,
                                     learning_rate=0.1,
                                     max_features=2,
Example #55
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.ensemble import GradientBoostingClassifier

malData = pd.read_csv("C:/Users/parsh/Desktop/MalwareData.csv", sep="|")

begn = malData[0:41323].drop(["legitimate"], axis=1)
mal = malData[41323::].drop(["legitimate"], axis=1)

data_in = malData.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = malData['legitimate'].values
extratrees = ExtraTreesClassifier().fit(data_in, labels)
select = SelectFromModel(extratrees, prefit=True)
data_new = select.transform(data_in)

data_train, data_test, labels_train, labels_test = train_test_split(data_new,
                                                                    labels,
                                                                    test_size=0.2)

grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(data_train, labels_train)
print(grad_boost.score(data_test, labels_test) * 100)
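
# cross_validate is imported above but unused in this excerpt; a minimal sketch
# (an addition, with 5-fold CV assumed) of scoring the same classifier on the
# selected features:
cv_results = cross_validate(GradientBoostingClassifier(n_estimators=50),
                            data_new, labels, cv=5)
print(cv_results['test_score'].mean() * 100)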
Example #56
#ensemble models

models = {}

print "Training on all features"
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=1010)

models['RFC'] = RandomForestClassifier(n_estimators=300)
models['XGB'] = xgb.XGBClassifier(max_depth=3,
                                  n_estimators=300,
                                  learning_rate=0.05)
models['GBC'] = GradientBoostingClassifier()
models['ABC'] = AdaBoostClassifier()
models['ETC'] = ExtraTreesClassifier()

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name)
    print(classification_report(y_test, model.predict(X_test)))
    print("Accuracy: ", accuracy_score(y_test, model.predict(X_test)))
    print('\n')

feature_importances = pd.DataFrame()

for name, model in models.items():
    df = pd.DataFrame(data=model.feature_importances_,
                      index=X_test.columns,
Example #57
def gradient_boosting_classifier(train_x, train_y):  
    from sklearn.ensemble import GradientBoostingClassifier  
    model = GradientBoostingClassifier(n_estimators=200)  
    model.fit(train_x, train_y)  
    return model  
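
# A minimal usage sketch for the helper above, on synthetic data; make_classification
# and the split below are assumptions used only for illustration:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, random_state=0)
model = gradient_boosting_classifier(X_tr, y_tr)
print(model.score(X_te, y_te))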
Example #58
def predefined_estimators(estimator, random_state, n_jobs, p):
    """
    Provides the classifiers and parameters used by the module

    Parameters
    -----------
    estimator : str
        Name of scikit-learn estimator.

    random_state : Any number
        Seed to use in randomized components.

    n_jobs : int
        Number of processing cores to use.

    p : dict
        Classifier settings (keys) and values.

    Returns
    -------
    clf : object
        Scikit-learn classifier object

    mode : str
        Flag to indicate whether classifier performs classification or
        regression.
    """
    try:
        from sklearn.experimental import enable_hist_gradient_boosting
    except ImportError:
        pass

    from sklearn.linear_model import (
        LogisticRegression,
        LinearRegression,
        SGDRegressor,
        SGDClassifier,
    )
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.naive_bayes import GaussianNB
    from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
    from sklearn.ensemble import (
        RandomForestClassifier,
        RandomForestRegressor,
        ExtraTreesClassifier,
        ExtraTreesRegressor,
    )
    from sklearn.ensemble import (GradientBoostingClassifier,
                                  GradientBoostingRegressor)
    from sklearn.svm import SVC, SVR
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.neural_network import MLPClassifier, MLPRegressor

    estimators = {
        "SVC":
        SVC(C=p["C"], probability=True, random_state=random_state),
        "SVR":
        SVR(C=p["C"], epsilon=p["epsilon"]),
        "LogisticRegression":
        LogisticRegression(
            C=p["C"],
            solver="liblinear",
            random_state=random_state,
            multi_class="auto",
            n_jobs=1,
            fit_intercept=True,
        ),
        "LinearRegression":
        LinearRegression(n_jobs=n_jobs, fit_intercept=True),
        "SGDClassifier":
        SGDClassifier(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            n_jobs=n_jobs,
            random_state=random_state,
        ),
        "SGDRegressor":
        SGDRegressor(
            penalty=p["penalty"],
            alpha=p["alpha"],
            l1_ratio=p["l1_ratio"],
            random_state=random_state,
        ),
        "DecisionTreeClassifier":
        DecisionTreeClassifier(
            max_depth=p["max_depth"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "DecisionTreeRegressor":
        DecisionTreeRegressor(
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
        ),
        "RandomForestClassifier":
        RandomForestClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "RandomForestRegressor":
        RandomForestRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "ExtraTreesClassifier":
        ExtraTreesClassifier(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            n_jobs=n_jobs,
            bootstrap=True,
            oob_score=True,
        ),
        "ExtraTreesRegressor":
        ExtraTreesRegressor(
            n_estimators=p["n_estimators"],
            max_features=p["max_features"],
            min_samples_leaf=p["min_samples_leaf"],
            random_state=random_state,
            bootstrap=True,
            n_jobs=n_jobs,
            oob_score=True,
        ),
        "GradientBoostingClassifier":
        GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "GradientBoostingRegressor":
        GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingClassifier":
        GradientBoostingClassifier(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "HistGradientBoostingRegressor":
        GradientBoostingRegressor(
            learning_rate=p["learning_rate"],
            n_estimators=p["n_estimators"],
            max_depth=p["max_depth"],
            min_samples_leaf=p["min_samples_leaf"],
            subsample=p["subsample"],
            max_features=p["max_features"],
            random_state=random_state,
        ),
        "MLPClassifier":
        MLPClassifier(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "MLPRegressor":
        MLPRegressor(
            hidden_layer_sizes=p["hidden_layer_sizes"],
            alpha=p["alpha"],
            random_state=random_state,
        ),
        "GaussianNB":
        GaussianNB(),
        "LinearDiscriminantAnalysis":
        LinearDiscriminantAnalysis(),
        "QuadraticDiscriminantAnalysis":
        QuadraticDiscriminantAnalysis(),
        "KNeighborsClassifier":
        KNeighborsClassifier(n_neighbors=p["n_neighbors"],
                             weights=p["weights"],
                             n_jobs=n_jobs),
        "KNeighborsRegressor":
        KNeighborsRegressor(n_neighbors=p["n_neighbors"],
                            weights=p["weights"],
                            n_jobs=n_jobs),
    }

    # define classifier
    model = estimators[estimator]

    # classification or regression
    if (estimator == "LogisticRegression" or estimator == "SGDClassifier"
            or estimator == "MLPClassifier"
            or estimator == "DecisionTreeClassifier"
            or estimator == "RandomForestClassifier"
            or estimator == "ExtraTreesClassifier"
            or estimator == "GradientBoostingClassifier"
            or estimator == "HistGradientBoostingClassifier"
            or estimator == "GaussianNB"
            or estimator == "LinearDiscriminantAnalysis"
            or estimator == "QuadraticDiscriminantAnalysis"
            or estimator == "SVC" or estimator == "KNeighborsClassifier"):
        mode = "classification"
    else:
        mode = "regression"

    return (model, mode)
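
# Hypothetical call to the factory above. Because the estimator dictionary is built
# eagerly, p must supply every key referenced by any entry; the values below are
# assumptions:
demo_params = {
    "C": 1.0, "epsilon": 0.1, "penalty": "l2", "alpha": 1e-4, "l1_ratio": 0.15,
    "max_depth": 3, "max_features": "sqrt", "min_samples_leaf": 1,
    "n_estimators": 200, "learning_rate": 0.1, "subsample": 0.8,
    "hidden_layer_sizes": (100,), "n_neighbors": 5, "weights": "uniform",
}
clf, mode = predefined_estimators("GradientBoostingClassifier",
                                  random_state=1, n_jobs=-1, p=demo_params)
print(mode)  # -> "classification"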
Example #59
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was: 0.9940458797222709
exported_pipeline = make_pipeline(
    RFE(estimator=ExtraTreesClassifier(criterion="entropy",
                                       max_features=0.55,
                                       n_estimators=100),
        step=0.15000000000000002),
    StackingEstimator(
        estimator=GradientBoostingClassifier(learning_rate=1.0,
                                             max_depth=1,
                                             max_features=0.8500000000000001,
                                             min_samples_leaf=6,
                                             min_samples_split=5,
                                             n_estimators=100,
                                             subsample=0.25)),
    StackingEstimator(
        estimator=LogisticRegression(C=1.0, dual=False, penalty="l1")),
    StackingEstimator(
        estimator=RandomForestClassifier(bootstrap=False,
                                         criterion="gini",
                                         max_features=0.7500000000000001,
                                         min_samples_leaf=14,
                                         min_samples_split=14,
                                         n_estimators=100)),
    KNeighborsClassifier(n_neighbors=2, p=2, weights="distance"))

exported_pipeline.fit(training_features, training_target)
Example #60
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier,ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from mlxtend.classifier import StackingClassifier,EnsembleVoteClassifier
from sklearn import model_selection
from sklearn.model_selection import KFold
clf1 = KNeighborsClassifier(4)
clf2 = DecisionTreeClassifier(criterion="gini")
clf3 = LogisticRegression()
lr = LogisticRegression()
gb = GradientBoostingClassifier()
classifiers = [
    # StackingClassifier(classifiers=[clf1, clf2, clf3],
    #                       meta_classifier=lr),
    # EnsembleVoteClassifier(clfs=[clf1, clf2, clf3],voting='soft', verbose=0),
    # SVC(kernel="linear", C=0.025),
    ExtraTreesClassifier(n_estimators=150, criterion="entropy", max_depth=None,
                         min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.,
                         max_features="auto", max_leaf_nodes=None,
                         bootstrap=False, oob_score=False, n_jobs=1, random_state=410,
                         verbose=0, warm_start=False, class_weight=None),
    RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                           max_depth=None, max_features='auto', max_leaf_nodes=None,
                           min_samples_leaf=1,
                           min_samples_split=2, min_weight_fraction_leaf=0.0,
                           n_estimators=70, n_jobs=1, oob_score=True, random_state=410,