def fit(self, X, y, sample_weight=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Training vector, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape (n_samples,)
            Target vector relative to X. Has to follow the convention 0 for
            normal data, 1 for anomalies.

        sample_weight : array-like, shape (n_samples,), optional
            Array of weights assigned to individual samples, typically the
            transaction amount in the case of transaction data. Used to grow
            regression trees producing additional rules to be tested.
            If not provided, each sample is given unit weight.

        Returns
        -------
        self : object
            Returns self.
        """

        X, y = check_X_y(X, y)
        check_classification_targets(y)
        self.n_features_ = X.shape[1]

        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        # input validation: at least two classes are required
        if n_classes < 2:
            raise ValueError("This method needs samples of at least 2 classes"
                             " in the data, but the data contains only one"
                             " class: %r" % self.classes_[0])

        if not isinstance(self.max_depth_duplication, int) \
                and self.max_depth_duplication is not None:
            raise ValueError("max_depth_duplication should be an integer")
        if not set(self.classes_) == set([0, 1]):
            warn("Found labels %s. This method assumes target class to be"
                 " labeled as 1 and normal data to be labeled as 0. Any label"
                 " different from 0 will be considered as being from the"
                 " target class." % set(self.classes_))
            y = (y > 0)

        # ensure that max_samples is in [1, n_samples]:
        n_samples = X.shape[0]

        if isinstance(self.max_samples, str):
            raise ValueError('max_samples (%s) is not supported. '
                             'Valid choices are: "auto", int or '
                             'float' % self.max_samples)

        elif isinstance(self.max_samples, INTEGER_TYPES):
            if self.max_samples > n_samples:
                warn("max_samples (%s) is greater than the "
                     "total number of samples (%s). max_samples "
                     "will be set to n_samples for estimation." %
                     (self.max_samples, n_samples))
                max_samples = n_samples
            else:
                max_samples = self.max_samples
        else:  # float
            if not (0. < self.max_samples <= 1.):
                raise ValueError("max_samples must be in (0, 1], got %r" %
                                 self.max_samples)
            max_samples = int(self.max_samples * X.shape[0])

        self.max_samples_ = max_samples

        self.rules_ = {}
        self.estimators_ = []
        self.estimators_samples_ = []
        self.estimators_features_ = []

        # default column names:
        # first generate placeholder feature names; real names are mapped via feature_dict_ below
        feature_names_ = [
            BASE_FEATURE_NAME + x for x in np.arange(X.shape[1]).astype(str)
        ]
        if self.feature_names is not None:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(self.feature_names)
            }
        # if no feature names were provided, fall back to the generated defaults
        else:
            self.feature_dict_ = {
                BASE_FEATURE_NAME + str(i): feat
                for i, feat in enumerate(feature_names_)
            }
        self.feature_names_ = feature_names_

        clfs = []
        regs = []
        # one max depth per bagging ensemble of base learners
        self._max_depths = self.max_depth \
            if isinstance(self.max_depth, Iterable) else [self.max_depth]

        for max_depth in self._max_depths:
            bagging_clf = BaggingClassifier(
                base_estimator=DecisionTreeClassifier(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            bagging_reg = BaggingRegressor(
                base_estimator=DecisionTreeRegressor(
                    max_depth=max_depth,
                    max_features=self.max_features,
                    min_samples_split=self.min_samples_split),
                n_estimators=self.n_estimators,
                max_samples=self.max_samples_,
                max_features=self.max_samples_features,
                bootstrap=self.bootstrap,
                bootstrap_features=self.bootstrap_features,
                # oob_score=... XXX may be added
                # if selection on tree perf needed.
                # warm_start=... XXX may be added to increase computation perf.
                n_jobs=self.n_jobs,
                random_state=self.random_state,
                verbose=self.verbose)

            clfs.append(bagging_clf)
            regs.append(bagging_reg)

        # define regression target:
        if sample_weight is not None:
            sample_weight = check_array(sample_weight, ensure_2d=False)
            weights = sample_weight - sample_weight.min()
            contamination = float(sum(y)) / len(y)
            y_reg = (pow(weights, 0.5) * 0.5 / contamination * (y > 0) - pow(
                (weights).mean(), 0.5) * (y == 0))
            y_reg = 1. / (1 + np.exp(-y_reg))  # sigmoid
        else:
            y_reg = y  # same target as a plain classification bagging

        for clf in clfs:
            clf.fit(X, y)
            self.estimators_ += clf.estimators_
            self.estimators_samples_ += clf.estimators_samples_
            self.estimators_features_ += clf.estimators_features_

        for reg in regs:
            reg.fit(X, y_reg)
            self.estimators_ += reg.estimators_
            self.estimators_samples_ += reg.estimators_samples_
            self.estimators_features_ += reg.estimators_features_

        rules_ = []
        for estimator, samples, features in zip(self.estimators_,
                                                self.estimators_samples_,
                                                self.estimators_features_):

            # Create mask for OOB samples
            mask = ~indices_to_mask(samples, n_samples)

            if sum(mask) == 0:
                warn("OOB evaluation not possible: doing it in-bag."
                     " Performance evaluation is likely to be wrong"
                     " (overfitting) and selected rules are likely to"
                     " not perform well! Please use max_samples < 1.")
                mask = samples
            rules_from_tree = self._tree_to_rules(
                estimator,
                np.array(self.feature_names_)[features])

            # XXX todo: idem without dataframe
            X_oob = pandas.DataFrame(
                (X[mask, :])[:, features],
                columns=np.array(self.feature_names_)[features])

            if X_oob.shape[1] > 1:  # otherwise pandas bug (cf. issue #16363)
                y_oob = y[mask]
                y_oob = np.array((y_oob != 0))

                # Add OOB performances to rules:

                rules_from_tree = [(r, self._eval_rule_perf(r, X_oob, y_oob))
                                   for r in set(rules_from_tree)]
                rules_ += rules_from_tree

        # Factorize rules before semantic tree filtering
        rules_ = [
            tuple(rule) for rule in [Rule(r, args=args) for r, args in rules_]
        ]

        # keep only rules verifying precision_min and recall_min:
        for rule, score in rules_:
            if score[0] >= self.precision_min and score[1] >= self.recall_min:
                if rule in self.rules_:
                    # update the score to the new mean
                    c = self.rules_[rule][2] + 1
                    b = self.rules_[rule][1] + 1. / c * (score[1] -
                                                         self.rules_[rule][1])
                    a = self.rules_[rule][0] + 1. / c * (score[0] -
                                                         self.rules_[rule][0])

                    self.rules_[rule] = (a, b, c)
                else:
                    self.rules_[rule] = (score[0], score[1], 1)

        self.rules_ = sorted(self.rules_.items(),
                             key=lambda x: (x[1][0], x[1][1]),
                             reverse=True)

        # Deduplicate the rule using semantic tree
        if self.max_depth_duplication is not None:
            self.rules_ = self.deduplicate(self.rules_)

        self.rules_ = sorted(self.rules_, key=lambda x: -self.f1_score(x))
        self.rules_without_feature_names_ = self.rules_

        # Replace generic feature names by real feature names
        self.rules_ = [(replace_feature_name(rule, self.feature_dict_), perf)
                       for rule, perf in self.rules_]

        return self
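A minimal usage sketch for the method above, assuming it belongs to a skope-rules-style estimator; the class name `SkopeRules`, the constructor arguments and the toy data below are illustrative assumptions, not part of the original code.

import numpy as np

rng = np.random.RandomState(42)
X = rng.randn(1000, 4)
y = (X[:, 0] + X[:, 1] > 1.5).astype(int)  # 1 = anomaly, 0 = normal data

clf = SkopeRules(n_estimators=10, precision_min=0.3, recall_min=0.1,
                 feature_names=["f0", "f1", "f2", "f3"], random_state=42)
clf.fit(X, y)

# rules_ holds (rule_string, (precision, recall, nb_occurrences)) tuples,
# sorted by decreasing F1 score.
for rule, perf in clf.rules_[:3]:
    print(rule, perf)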
Example #2
# In[ ]:


import matplotlib
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, log_loss


clf = RandomForestClassifier(n_estimators=250, n_jobs=-1)
# we use a BaggingClassifier to make 5 predictions, and average,
# because that's what CalibratedClassifierCV does behind the scenes,
# and we want to compare things fairly
clfbag = BaggingClassifier(clf, n_estimators=5)
clfbag.fit(X_train, y_train)
# make predictions for test data
y_pred = clfbag.predict(X_test)

predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# ypreds = clfbag.predict_proba(X_test)
# print("%.2f" % log_loss(y_test, ypreds, eps=1e-15, normalize=True))
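For the fairness comparison described in the comment above, the calibrated counterpart might look like the following sketch; the calibration method and cv value are assumptions.

from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

# Calibrate the same base forest with 5-fold CV, then compare probabilistic quality.
clf_cal = CalibratedClassifierCV(clf, method='isotonic', cv=5)
clf_cal.fit(X_train, y_train)
ypreds_cal = clf_cal.predict_proba(X_test)
print("calibrated log loss: %.4f" % log_loss(y_test, ypreds_cal))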


# In[ ]:
Example #3
data = pd.read_csv("subset1_w3.csv")
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=50)

n_estimators = 3

print("Developing SVM models....")
model3 = OneVsRestClassifier(
    BaggingClassifier(LinearSVC(class_weight='balanced', max_iter=100000),
                      max_samples=1.0 / n_estimators,
                      n_estimators=n_estimators))
print("Fitting SVM models....")
model3.fit(X_train, y_train)

svm_probs = model3.decision_function(X_test)
svm_auc = roc_auc_score(y_test, svm_probs)
print("SVM - Accuracy: %f" % accuracy_score(y_test, model3.predict(X_test)))
print("SVM - AUC score: %f" % svm_auc)
print(classification_report(y_test, model3.predict(X_test)))

base_fpr, base_tpr, _ = roc_curve(y_test, [1 for _ in range(len(y_test))])
svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_probs)

plt.style.use('fivethirtyeight')
plt.figure(figsize=(8, 6))
Example #4
Naive Bayes is a rapid classification method. It uses the famous Bayes theorem under the 'naive' assumption that all predictor features are independent of each other (and only related to the target variable).

Despite this oversimplification, Naive Bayes classifiers perform well in many cases. In addition, they are fast to compute and require relatively little data to perform well.
"""

clf_bay = GaussianNB()
clf_bay.fit(X, y)
score_bay = cross_val_score(clf_bay, X, y, cv=5).mean()
print(score_bay)

# In[ ]:

#bagging
bagging = BaggingClassifier(KNeighborsClassifier(n_neighbors=2,
                                                 weights='distance'),
                            oob_score=True,
                            max_samples=0.5,
                            max_features=1.0)
clf_bag = bagging.fit(X, y)
score_bag = clf_bag.oob_score_
print(score_bag)

# In[ ]:

# Decision Tree
clf_tree = tree.DecisionTreeClassifier(class_weight='balanced',
                                       min_weight_fraction_leaf=0.01)
clf_tree = clf_tree.fit(X, y)
score_tree = cross_val_score(clf_tree, X, y, cv=5).mean()
print(score_tree)
Example #5
    forest = forest.fit(X_train, y_train)
    proba = forest.predict_proba(X_test)
    proba = proba[:, 1]
    y_test = np.array(y_test)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, proba, pos_label=1)
    loss = metrics.auc(fpr, tpr)
    print(loss)
    return loss


def kfold_validation(data=train, y=y, trials=10):
    # requires: from sklearn.model_selection import StratifiedKFold
    skf = StratifiedKFold(n_splits=trials)
    error = 0.0
    for train_index, test_index in skf.split(data, y):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = y[train_index], y[test_index]
        error += train_and_test(X_train, X_test, y_train, y_test)
    return error / trials


score = kfold_validation()
print(score)

forest = BaggingClassifier(n_estimators=1000, random_state=1234)
forest = forest.fit(train, y)
proba = forest.predict_proba(test)
proba = proba[:, 1]
submission = pd.DataFrame({"bidder_id": idx, "prediction": proba})
submission.to_csv("submissions/submission_bag1.csv", index=False)
print('Done.')
Example #6
def runf2():
    heloc = pd.read_csv(filecd, engine='python')
    df = pd.DataFrame(heloc)
    mymap = {'Good': 1, 'Bad': 0}
    df = df.applymap(lambda s: mymap.get(s) if s in mymap else s)
    without9_filter = \
    (df['ExternalRiskEstimate']!=-9) & \
    (df['MSinceOldestTradeOpen']!=-9) & \
    (df['MSinceMostRecentTradeOpen']!=-9) & \
    (df['AverageMInFile']!=-9) & \
    (df['NumSatisfactoryTrades']!=-9) & \
    (df['NumTrades60Ever2DerogPubRec']!=-9) & \
    (df['NumTrades90Ever2DerogPubRec']!=-9) & \
    (df['PercentTradesNeverDelq']!=-9) & \
    (df['MSinceMostRecentDelq']!=-9) & \
    (df['MaxDelq2PublicRecLast12M']!=-9) & \
    (df['MaxDelqEver']!=-9) & \
    (df['NumTotalTrades']!=-9) & \
    (df['NumTradesOpeninLast12M']!=-9) & \
    (df['PercentInstallTrades']!=-9) & \
    (df['MSinceMostRecentInqexcl7days']!=-9) & \
    (df['NumInqLast6M']!=-9) & \
    (df['NumInqLast6Mexcl7days']!=-9) & \
    (df['NetFractionRevolvingBurden']!=-9) & \
    (df['NetFractionInstallBurden']!=-9) & \
    (df['NumRevolvingTradesWBalance']!=-9) & \
    (df['NumInstallTradesWBalance']!=-9) & \
    (df['NumBank2NatlTradesWHighUtilization']!=-9) & \
    (df['PercentTradesWBalance']!=-9)
    dfnew = df[without9_filter]
    dfwithnan = dfnew.replace([-8, -7], np.nan)
    dfwithnan.info(buf=buf)
    txt.insert(END, buf.getvalue())
    data_new1 = dfwithnan.drop("MSinceMostRecentDelq", axis=1)
    data_new2 = dfwithnan.drop("MSinceMostRecentInqexcl7days", axis=1)
    data_new3 = dfwithnan.drop("NetFractionInstallBurden", axis=1)
    data_new = data_new3
    from sklearn.model_selection import train_test_split
    train_set, test_set = train_test_split(data_new,
                                           test_size=0.2,
                                           random_state=1)
    data_train_onlyx = train_set.drop("RiskPerformance", axis=1)
    from sklearn.impute import SimpleImputer  # Imputer was removed from sklearn.preprocessing
    imputer = SimpleImputer(strategy="median")
    imputer.fit(data_train_onlyx)
    X = imputer.transform(data_train_onlyx)
    data_tr = pd.DataFrame(X, columns=data_train_onlyx.columns)
    X = data_tr[[
        'ExternalRiskEstimate', 'MSinceOldestTradeOpen',
        'MSinceMostRecentTradeOpen', 'AverageMInFile', 'NumSatisfactoryTrades',
        'NumTrades60Ever2DerogPubRec', 'NumTrades90Ever2DerogPubRec',
        'PercentTradesNeverDelq', 'NumTotalTrades', 'NumTradesOpeninLast12M',
        'PercentInstallTrades', 'NumInqLast6M', 'NumInqLast6Mexcl7days',
        'NetFractionRevolvingBurden', 'NumRevolvingTradesWBalance',
        'NumInstallTradesWBalance', 'NumBank2NatlTradesWHighUtilization',
        'PercentTradesWBalance', 'MaxDelq2PublicRecLast12M', 'MaxDelqEver'
    ]]
    Y = train_set['RiskPerformance']
    random.seed(36)
    Bagging_max = []
    RF_max = []
    SingleTree_max = []
    n_Bagging = []
    n_RF = []
    n_SingleTree = []
    df = pd.DataFrame()

    for tree_depth in range(1, 9, 1):
        clf_tree = tree.DecisionTreeClassifier(max_depth=tree_depth)
        clf_tree = clf_tree.fit(X, Y)
        clf_tree_scores = cross_val_score(clf_tree, X, Y, cv=5)
        base_clf = tree.DecisionTreeClassifier(
            max_depth=tree_depth)  # base classifier
        results = []
        n_range = range(1, 30, 1)
        for n in n_range:
            # bagging classifier with n bootstrapped data sets
            clf_bagging = BaggingClassifier(n_estimators=n,
                                            base_estimator=base_clf)
            scores = cross_val_score(clf_bagging, X, Y, cv=5)

            # random forest classifier with n bootstrapped data sets m=sqrt(p)
            clf_rf = RandomForestClassifier(max_features="sqrt",
                                            n_estimators=n,
                                            max_depth=tree_depth)
            clf_rf_scores = cross_val_score(clf_rf, X, Y, cv=5)

            results.append(
                (n, scores.mean(), scores.std(), clf_rf_scores.mean(),
                 clf_rf_scores.std(), clf_tree_scores.mean()))

        df_accuracy = pd.DataFrame(data=results,
                                   columns=[
                                       'n', 'Bagging accuracy',
                                       'Bagging error', 'RF accuracy',
                                       'RF error', 'Single tree'
                                   ])
        df_accuracy.index = df_accuracy['n']
        df_accuracy = df_accuracy[[
            'Bagging accuracy', 'RF accuracy', 'Single tree'
        ]]
        Bagging_max.append(max(df_accuracy['Bagging accuracy']))
        RF_max.append(max(df_accuracy['RF accuracy']))
        SingleTree_max.append(max(df_accuracy['Single tree']))
        n_Bagging.append(df_accuracy.idxmax()['Bagging accuracy'])
        n_RF.append(df_accuracy.idxmax()['RF accuracy'])
        n_SingleTree.append(df_accuracy.idxmax()['Single tree'])
    df = pd.DataFrame()
    df['RF'] = RF_max
    df['Rn'] = n_RF
    df['Bagging'] = Bagging_max
    df['Bn'] = n_Bagging
    df['CART'] = SingleTree_max
    Tree_Depth = df.index + 1
    df['Depth'] = Tree_Depth
    modelresult.insert(END, str(df))
Example #7
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier

algorithms = [
    LogisticRegression(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    SGDClassifier(loss='modified_huber'),
    SVC(probability=True),
    ComplementNB(),
    MLPClassifier(),
    KNeighborsClassifier(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    BaggingClassifier(),
    XGBClassifier()
]

if best_algo == 'LogisticRegression':
    algo = getattr(sklearn.linear_model, best_algo)()

if best_algo == 'SGDClassifier':
    algo = getattr(sklearn.linear_model, best_algo)(loss='modified_huber')

if best_algo in ('RandomForestClassifier', 'AdaBoostClassifier',
                 'GradientBoostingClassifier', 'BaggingClassifier'):
    algo = getattr(sklearn.ensemble, best_algo)()
def bagging_test():
    bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5,
                                max_features=0.5)
    test_model(bagging)
dataframe = pd.read_csv('data/loan_prediction.csv')
X = dataframe.iloc[:, :-1]
y = dataframe.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=9)
# Write your code here
clf1 = LogisticRegression(random_state=9)
clf2 = DecisionTreeClassifier(random_state=9)
clf3 = DecisionTreeClassifier(max_depth=9, random_state=9)

bagging_clf1 = BaggingClassifier(clf2,
                                 n_estimators=100,
                                 max_samples=100,
                                 bootstrap=True,
                                 random_state=9,
                                 oob_score=True)
bagging_clf2 = BaggingClassifier(clf1,
                                 n_estimators=100,
                                 max_samples=100,
                                 bootstrap=True,
                                 random_state=9,
                                 oob_score=True)
bagging_clf3 = BaggingClassifier(clf3,
                                 n_estimators=100,
                                 max_samples=100,
                                 bootstrap=True,
                                 random_state=9,
                                 oob_score=True)
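Since the three bagging models above are built with oob_score=True, a plausible next step (an assumption about how the exercise continues) is to fit them and compare their out-of-bag accuracy estimates:

# Fit each bagging model and inspect its out-of-bag accuracy estimate.
for name, model in [("bagged trees", bagging_clf1),
                    ("bagged logistic regression", bagging_clf2),
                    ("bagged deep trees", bagging_clf3)]:
    model.fit(X_train, y_train)
    print("%s OOB score: %.3f" % (name, model.oob_score_))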
Example #10
# Load data from numpy file
X =  np.load("C:/Users/deyso/PycharmProjects/sound/mp3folder/npy_files_TOTAL_train/features1.npy")
t =  np.load("C:/Users/deyso/PycharmProjects/sound/mp3folder/npy_files_TOTAL_train/labels.npy")
#print(len(t))
ac=[0,0,0,0,0,0,0,0]
for i in range(0,50):
   k = []
   for p in t:
      k.append(p[i])
   y=np.array(k)


   #print(X,y)
   # Split data into training and test subsets
   X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
   models = [BaggingClassifier(), RandomForestClassifier(), AdaBoostClassifier(),
             KNeighborsClassifier(), GaussianNB(), tree.DecisionTreeClassifier(),
             svm.SVC(C=20.0, gamma=0.00001), OutputCodeClassifier(BaggingClassifier())]
   model_names = ["Bagging with DT", "Random Forest", "AdaBoost", "KNN", "Naive Bayes", "Decision Tree",
                  "Linear SVM", "OutputCodeClassifier with Linear SVM",]
   #ac=[0,0,0,0,0,0,0,0]
   count=0
   for model, name in zip(models, model_names):
      # print('fitting...')
      model.fit(X_train, y_train)
      prediction = model.predict(X_test)
      # print accuracy
      acc = accuracy_score(y_test, prediction)
      # clf = SVC(C=20.0, gamma=0.00001)
      # clf.fit(X_train, y_train)
    "hotel_cluster", "date_time", "srch_ci", "srch_co", "event_date",
    "event_time"
],
                                            axis=1)

print("Train Feature shape:", exp_data_train_features.shape)
print("Train label shape:", exp_data_train_labels.shape)
print("Test Feature shape:", exp_data_test_features.shape)
print("Test label shape:", exp_data_test_labels.shape)

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

clf_tre = DecisionTreeClassifier()

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_jobs=-1)

print("before fit")

bag_clf.fit(exp_data_train_features, exp_data_train_labels)

pred = bag_clf.predict(exp_data_test_features)
pred_prob = bag_clf.predict_proba(exp_data_test_features)

print(pred)
print(pred_prob)

probs = pd.DataFrame(pred_prob)
probs.columns = np.unique(exp_data_test_labels.sort_values().values)

print(probs.columns)
Example #12
model.add(Dense(5, activation="softmax"))
# DNN model
model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.01), metrics=['accuracy'])
# CART model
c = tree.DecisionTreeClassifier(criterion = "gini", random_state = 42, splitter="best",
                                max_depth=3, min_samples_leaf=2)
# ANN model
ann = MLPClassifier(solver='adam', alpha=1e-2, max_iter=300,
                    hidden_layer_sizes=(8), random_state=42)
# Ensemble: Random Forests
# n_estimators='warn' and max_features='auto' (old sklearn sentinels) replaced by their effective values
rf = RandomForestClassifier(bootstrap=True, criterion='gini',
                            max_depth=4, max_leaf_nodes=None, max_features="sqrt",
                            min_samples_split=2, n_estimators=10, random_state=42,
                            verbose=0, warm_start=False)
# Ensemble: Bagging
bagging = BaggingClassifier(rf, n_estimators=500, max_samples=1.0, random_state=42)
# Ensemble: Weighted Voting - not included in the report
eclf3 = VotingClassifier(estimators=[
       ('CART', c), ('ANN', ann), ('BAG', bagging)],
       voting='soft', weights=[14,3,3],
       flatten_transform=True)

# Training models
H = model.fit(x_train, y_train2, validation_data=(x_test, y_test2),
              epochs=120, batch_size=100)
c.fit(x_train, y_train)
ann.fit(x_train, y_train)
eclf3 = eclf3.fit(x_train, y_train)
bagging.fit(x_train, y_train)
rf.fit(x_train, y_train)
Example #13
parameters = {'n_estimators':[100, 200, 300, 400, 500], 'max_features':[1, 2, 3, 4, 5]}
rf = RandomForestClassifier(random_state=2)
rf_clf = GridSearchCV(rf, param_grid=parameters)
rf_clf.fit(x_train, y_train)
ytst = np.empty(shape = 1)
y_hat_test = rf_clf.predict(x_test)
for i in range(0, len(y_hat_test)) :
    ytst = np.append(ytst,y_test.values[i][0])
ytst = np.delete(ytst, 0)
print(accuracy_score(ytst, y_hat_test))
print(pd.crosstab(ytst, y_hat_test, rownames=['actual'], colnames=['preds']))

# bagging
parameters = {'n_estimators':[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
              'max_features':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
bag = BaggingClassifier(random_state=7)
bag_clf = GridSearchCV(bag, param_grid=parameters)
bag_clf.fit(x_train, y_train)
ytst = np.empty(shape = 1)
y_hat_test = bag_clf.predict(x_test)
for i in range(0, len(y_hat_test)) :
    ytst = np.append(ytst,y_test.values[i][0])
ytst = np.delete(ytst, 0)
print(accuracy_score(ytst, y_hat_test))
print(pd.crosstab(ytst, y_hat_test, rownames=['actual'], colnames=['preds']))


# boosting
parameters = {'n_estimators':[30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
              'learning_rate':[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}
boost = AdaBoostClassifier(random_state=9)
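A plausible continuation of the boosting block, mirroring the random-forest and bagging grid searches above (this part is not in the original snippet):

boost_clf = GridSearchCV(boost, param_grid=parameters)
boost_clf.fit(x_train, y_train)
ytst = np.empty(shape=1)
y_hat_test = boost_clf.predict(x_test)
for i in range(0, len(y_hat_test)):
    ytst = np.append(ytst, y_test.values[i][0])
ytst = np.delete(ytst, 0)
print(accuracy_score(ytst, y_hat_test))
print(pd.crosstab(ytst, y_hat_test, rownames=['actual'], colnames=['preds']))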
Example #14
X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

# Instantiate the CountVectorizer method
count_vector = CountVectorizer()

# Fit the training data and then return the matrix
training_data = count_vector.fit_transform(X_train)

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
testing_data = count_vector.transform(X_test)

# Instantiate a number of our models
naive_bayes = MultinomialNB()
bag_mod = BaggingClassifier(n_estimators=200)
rf_mod = RandomForestClassifier(n_estimators=200)
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)
svm_mod = SVC()




# Fit each of the 5 models
# This might take some time to run
naive_bayes.fit(training_data,y_train)
bag_mod.fit(training_data,y_train)
rf_mod.fit(training_data,y_train)
ada_mod.fit(training_data,y_train)
svm_mod.fit(training_data,y_train)
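A natural follow-up, and an assumption about how the exercise continues, is to score each fitted model on the transformed test matrix:

from sklearn.metrics import accuracy_score

# Evaluate each fitted model on the vectorized test data.
for name, model in [("naive bayes", naive_bayes), ("bagging", bag_mod),
                    ("random forest", rf_mod), ("adaboost", ada_mod),
                    ("svm", svm_mod)]:
    preds = model.predict(testing_data)
    print("%s accuracy: %.3f" % (name, accuracy_score(y_test, preds)))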
print ('\nTest score --->   ', accuracy_tree)
print ('\nFeature importances:\n')
importances = cla_tree_pipe1.named_steps['tree_cla'].feature_importances_
indices = np.argsort(importances)[::-1]


# Plot the feature importances
plt.figure(figsize=(8, 6))
plt.title("Feature importances")
sns.barplot(indices, y=importances[indices])
plt.show()


estimators = []
estimators.append(('Normalizer', Normalizer()))
estimators.append(('bag_cla', BaggingClassifier()))
cla_bag_pipe1 = Pipeline(estimators)
cla_bag_pipe1.set_params(bag_cla__base_estimator=tree.DecisionTreeClassifier(max_depth=estimator_treecla.best_params_['tree_cla__max_depth']),\
                         bag_cla__n_estimators=500, bag_cla__random_state=seed)

cla_bag_pipe1.fit(X_train,y_train)
accuracy_bagging=cla_bag_pipe1.score(X_test, y_test)
print ('\nBagging Test score --->   ', accuracy_bagging)

#%%

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
                  

Example #16
                                                    wine_target,
                                                    test_size=0.3)

# -----------------------------------------------------------------------------
# Bagging

param_grid = {
    'base_estimator': [
        tree.DecisionTreeClassifier(),
        KNeighborsClassifier(n_neighbors=3),
        GaussianNB()
    ],
    'n_estimators':
    np.arange(1, 100)
}
estimator = BaggingClassifier()
optimizeEstimator('Bagging', estimator, param_grid)

# -----------------------------------------------------------------------------
# Boosting

param_grid = {
    'base_estimator': [tree.DecisionTreeClassifier(),
                       GaussianNB()],
    'n_estimators': np.arange(1, 100),
    'learning_rate': np.arange(0.1, 1.01, 0.1),
    'algorithm': ['SAMME', 'SAMME.R']
}
estimator = AdaBoostClassifier()
optimizeEstimator('Boosting - AdaBoost', estimator, param_grid)
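The optimizeEstimator helper is not shown in this snippet; a rough sketch of what it likely does (a grid search over the supplied parameter grid) is given below. The training variables X_train and y_train stand in for the split produced above and are assumptions.

from sklearn.model_selection import GridSearchCV

def optimizeEstimator(name, estimator, param_grid):
    # Hypothetical helper: grid-search the estimator and report the best setting.
    search = GridSearchCV(estimator, param_grid=param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)  # training split (variable names assumed)
    print('%s: best score %.3f with %s' % (name, search.best_score_, search.best_params_))
    return search.best_estimator_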
Example #17
    "Bagging",
    "ERT",
    "GB"
]

classifiers = [
    # KNeighborsClassifier(n_neighbors=50, leaf_size=1),
    DecisionTreeClassifier(max_depth=10000),
    RandomForestClassifier(n_estimators=1000,
                           max_depth=100000,
                           max_features='sqrt'),
    AdaBoostClassifier(n_estimators=1000),
    MLPClassifier(alpha=1, activation='logistic'),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    BaggingClassifier(n_estimators=1000, max_features=30),
    ExtraTreesClassifier(n_estimators=1000,
                         max_depth=10000,
                         min_samples_split=2,
                         random_state=0),
    GradientBoostingClassifier(n_estimators=1000,
                               learning_rate=1.0,
                               max_depth=1,
                               random_state=0)
]
loop = int(((x.shape[1])**(1 / 2.0)) * 4)

n_components_score = {}
n_clusters_score = {}
# for clust in range(x.shape[1]):
#     # plsca = PLSCanonical(n_components=clust+10)
Example #18
'''

import numpy as np
x_train = np.load('dataset/oversampled/smote/x_train.npy')
x_test = np.load('dataset/normalized/x_test.npy')
x_val = np.load('dataset/partitioned/x_val.npy')
y_train = np.load('dataset/oversampled/smote/y_train.npy')
y_test = np.load('dataset/normalized/y_test.npy')
y_val = np.load('dataset/partitioned/y_val.npy')

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

base_cls = DecisionTreeClassifier()
num_trees = 50
model = BaggingClassifier(base_estimator=base_cls,
                          n_estimators=num_trees,
                          random_state=0)

model.fit(x_train, y_train)

y_pred = model.predict(x_val)

print(confusion_matrix(y_val, y_pred))

y_pred = model.predict(x_test)

print(confusion_matrix(y_test, y_pred))
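
# Follow-up sketch (not in the original): accuracy_score is imported above, so the
# same splits can also be summarised with plain accuracy.
print("validation accuracy: %.3f" % accuracy_score(y_val, model.predict(x_val)))
print("test accuracy: %.3f" % accuracy_score(y_test, model.predict(x_test)))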
    def trainingMethod(self):

        self.model = BaggingClassifier(n_estimators=self.n_estimators,
                                       bootstrap=self.bootstrap,
                                       n_jobs=-1)
        self.BagginAlgorithm = self.model.fit(self.dataset, self.target)
Example #20
kouzhao = targets['kouzhao']
laxiang = targets['laxiang']
maozi = targets['maozi']
yanjing = targets['yanjing']

RANDOM_STATE = 500
X_train, X_test, y_db_train, y_db_test = train_test_split(
    bottlenecks, kouzhao, test_size=0.15, random_state=RANDOM_STATE)
ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

t1 = time.time()
# clf = DecisionTreeClassifier().fit(X_train, y_db_train)
clfb = BaggingClassifier(base_estimator=DecisionTreeClassifier(),
                         max_samples=0.5,
                         max_features=0.5).fit(X_train, y_db_train)

# predict = clf.predict(X_test)
predict = clfb.predict(X_test)

# print(clf.score(X_test, y_db_test))
print('****************metrics***************')
print(classification_report(y_db_test, predict))
print('-------precision_score:')
print(precision_score(y_db_test, predict))
print('-------recall_score:')
print(recall_score(y_db_test, predict))
print('-------F1_score:')
print(f1_score(y_db_test, predict))
print('------------time:')
Example #21
Indian Liver Patient dataset from the UCI machine learning 
repository. Your task is to predict whether a patient suffers 
from a liver disease using 10 features including Albumin, age and gender. 
You'll do so using a Bagging Classifier.
"""

# Import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier

# Import BaggingClassifier
from sklearn.ensemble import BaggingClassifier
# Instantiate dt
dt = DecisionTreeClassifier(random_state=1)

# Instantiate bc
bc = BaggingClassifier(base_estimator=dt, n_estimators=50, random_state=1)
"""
Evaluate Bagging performance
Now that you instantiated the bagging classifier, 
it's time to train it and evaluate its test set accuracy.
The Indian Liver Patient dataset is processed for you and split
into 80% train and 20% test. The feature matrices X_train and X_test,
as well as the arrays of labels y_train and y_test are available in your workspace.
In addition, we have also loaded the bagging classifier bc that you instantiated in 
the previous exercise and the function accuracy_score() from sklearn.metrics.
"""

# Fit bc to the training set
bc.fit(X_train, y_train)

# Predict test set labels
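# (Sketch of the likely continuation; not part of the original snippet.)
y_pred = bc.predict(X_test)

# Evaluate test-set accuracy
acc_test = accuracy_score(y_test, y_pred)
print('Test set accuracy of bc: {:.2f}'.format(acc_test))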
Example #22
]

sections = [
    "Accuracy", "Sensitivity", "Specificity", "F-1 klasa mniejszosciowa",
    'G-mean'
]

tables = []
for tab in range(5):
    table = Tabular('c|cccccc')
    table.add_row(('', "Bag TREE", "Bag TREE SMOTE", "AB TREE",
                   "AB TREE SMOTE", "Stacking", "Stacking SMOTE"))
    table.add_hline()
    tables.append(table)

clf1 = BaggingClassifier(tree.DecisionTreeClassifier(max_depth=3),
                         n_estimators=50)
clf2 = AdaBoostClassifier(tree.DecisionTreeClassifier(max_depth=3),
                          n_estimators=50)

meta = MLPClassifier(solver='lbfgs', random_state=1)
stacking = StackingCVClassifier(classifiers=[
    KNeighborsClassifier(),
    tree.DecisionTreeClassifier(max_depth=3),
    GaussianNB()
],
                                meta_classifier=meta)

# number of classification repetitions
iterations = 10

# number of folds in the cross-validation
Example #23
tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
tree_clf.fit(X, y)

deep_tree_clf2 = DecisionTreeClassifier(min_samples_leaf=4, random_state=42)
deep_tree_clf2.fit(X, y)

polynomial_svm_clf = Pipeline([("poly_features", PolynomialFeatures(degree=3)),
                               ("scaler", StandardScaler()),
                               ("svm_clf",
                                LinearSVC(C=10, loss="hinge",
                                          random_state=42))])
polynomial_svm_clf.fit(X, y)

rf = RandomForestClassifier()
rf.fit(X, y)
bag = BaggingClassifier()
bag.fit(X, y)
knn3 = KNeighborsClassifier(3)
knn3.fit(X, y)

#-----------------------------------------------------------------
# plot boundary
#-----------------------------------------------------------------
# plt.figure(figsize=(8, 4))
plot_decision_boundary(tree_clf, X, y)
plot_decision_boundary(tree_clf, X, y, newPlot=True)
plot_decision_boundary(deep_tree_clf2, X, y)
plot_decision_boundary(deep_tree_clf2, X, y, newPlot=False)
plot_decision_boundary(polynomial_svm_clf, X, y)
plt.figure()
plot_decision_boundary(bag, X, y)
Example #24
def featureSelection(globalIndex, coefReduction):

    # a few hard-coded values
    numberOfFolds = 10

    # list of classifiers, selected on the basis of our previous paper
    classifierList = [
        # ensemble
        #[AdaBoostClassifier(), "AdaBoostClassifier"],
        #[AdaBoostClassifier(n_estimators=300), "AdaBoostClassifier(n_estimators=300)"],
        #[AdaBoostClassifier(n_estimators=1500), "AdaBoostClassifier(n_estimators=1500)"],
        #[BaggingClassifier(), "BaggingClassifier"],
        [
            GradientBoostingClassifier(n_estimators=300),
            "GradientBoostingClassifier(n_estimators=300)"
        ],
        [
            RandomForestClassifier(n_estimators=300),
            "RandomForestClassifier(n_estimators=300)"
        ],
        [LogisticRegression(), "LogisticRegression"],
        [PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"],
        [SGDClassifier(), "SGDClassifier"],
        [SVC(kernel='linear'), "SVC(linear)"],
        [RidgeClassifier(), "RidgeClassifier"],
        [
            BaggingClassifier(n_estimators=300),
            "BaggingClassifier(n_estimators=300)"
        ],
        #[ExtraTreesClassifier(), "ExtraTreesClassifier"],
        #[ExtraTreesClassifier(n_estimators=300), "ExtraTreesClassifier(n_estimators=300)"],
        #[GradientBoostingClassifier(), "GradientBoostingClassifier"], # features_importances_
        #[GradientBoostingClassifier(n_estimators=300), "GradientBoostingClassifier(n_estimators=300)"],
        #[GradientBoostingClassifier(n_estimators=1000), "GradientBoostingClassifier(n_estimators=1000)"],
        #[RandomForestClassifier(), "RandomForestClassifier"],
        #[RandomForestClassifier(n_estimators=300), "RandomForestClassifier(n_estimators=300)"],
        #[RandomForestClassifier(n_estimators=1000), "RandomForestClassifier(n_estimators=1000)"], # features_importances_

        # linear
        #[ElasticNet(), "ElasticNet"],
        #[ElasticNetCV(), "ElasticNetCV"],
        #[Lasso(), "Lasso"],
        #[LassoCV(), "LassoCV"],
        #[LogisticRegression(), "LogisticRegression"], # coef_
        #[LogisticRegressionCV(), "LogisticRegressionCV"],
        #[PassiveAggressiveClassifier(), "PassiveAggressiveClassifier"], # coef_
        #[RidgeClassifier(), "RidgeClassifier"], # coef_
        #[RidgeClassifierCV(), "RidgeClassifierCV"],
        #[SGDClassifier(), "SGDClassifier"], # coef_
        #[SVC(kernel='linear'), "SVC(linear)"], # coef_, but only if the kernel is linear...the default is 'rbf', which is NOT linear

        # naive Bayes
        #[BernoulliNB(), "BernoulliNB"],
        #[GaussianNB(), "GaussianNB"],
        #[MultinomialNB(), "MultinomialNB"],

        # neighbors
        #[KNeighborsClassifier(), "KNeighborsClassifier"], # no way to return feature importance
        # TODO this one creates issues
        #[NearestCentroid(), "NearestCentroid"], # it does not have some necessary methods, apparently
        #[RadiusNeighborsClassifier(), "RadiusNeighborsClassifier"],

        # tree
        #[DecisionTreeClassifier(), "DecisionTreeClassifier"],
        #[ExtraTreeClassifier(), "ExtraTreeClassifier"],
    ]

    # this is just a hack to check a few things
    #classifierList = [
    #		[RandomForestClassifier(), "RandomForestClassifier"]
    #		]

    print("Loading dataset...")
    X, y, biomarkerNames = loadDataset(globalIndex)

    if (int(len(y)) > 100):
        numberOfTopFeatures = 100
    else:
        numberOfTopFeatures = int(len(y) - 1)

    numberOfTopFeatures = int(numberOfTopFeatures / coefReduction)
    # create folder
    folderName = "FS"
    if not os.path.exists(folderName): os.makedirs(folderName)

    # prepare folds
    skf = StratifiedKFold(n_splits=numberOfFolds, shuffle=True)
    indexes = [(training, test) for training, test in skf.split(X, y)]

    # this will be used for the top features
    topFeatures = dict()

    # iterate over all classifiers
    classifierIndex = 0

    globalAccuracy = 0

    for originalClassifier, classifierName in classifierList:

        print("\nClassifier " + classifierName)
        classifierPerformance = []
        classifierTopFeatures = dict()

        # iterate over all folds
        for train_index, test_index in indexes:

            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # let's normalize, anyway
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            classifier = copy.deepcopy(originalClassifier)
            classifier.fit(X_train, y_train)
            scoreTraining = classifier.score(X_train, y_train)
            scoreTest = classifier.score(X_test, y_test)

            print("\ttraining: %.4f, test: %.4f" % (scoreTraining, scoreTest))
            classifierPerformance.append(scoreTest)

            # now, let's get a list of the most important features, then mark the ones in the top X
            orderedFeatures = relativeFeatureImportance(classifier)
            for i in range(0, numberOfTopFeatures):

                feature = int(orderedFeatures[i][1])

                if feature in topFeatures:
                    topFeatures[feature] += 1
                else:
                    topFeatures[feature] = 1

                if feature in classifierTopFeatures:
                    classifierTopFeatures[feature] += 1
                else:
                    classifierTopFeatures[feature] = 1

        line = "%s\t%.4f\t%.4f\n" % (classifierName,
                                     np.mean(classifierPerformance),
                                     np.std(classifierPerformance))

        globalAccuracy = globalAccuracy + np.mean(classifierPerformance)

        print(line)
        fo = open("./data/results.txt", 'a')
        fo.write(line)
        fo.close()
        # save most important features for the classifier
        with open(os.path.join(folderName, classifierName + ".csv"),
                  "w") as fp:

            fp.write("feature,frequencyInTop" + str(numberOfTopFeatures) +
                     "\n")

            # transform dictionary into list
            listOfClassifierTopFeatures = [(key, classifierTopFeatures[key])
                                           for key in classifierTopFeatures]
            listOfClassifierTopFeatures = sorted(listOfClassifierTopFeatures,
                                                 key=lambda x: x[1],
                                                 reverse=True)

            for feature, frequency in listOfClassifierTopFeatures:
                fp.write(
                    str(biomarkerNames[feature]) + "," +
                    str(float(frequency / numberOfFolds)) + "\n")

    # save most important features overall
    with open(
            os.path.join(folderName,
                         "global_" + str(int(globalIndex)) + ".csv"),
            "w") as fp:

        fp.write("feature,frequencyInTop" + str(numberOfTopFeatures) + "\n")

        # transform dictionary into list
        listOfTopFeatures = [(key, topFeatures[key]) for key in topFeatures]
        listOfTopFeatures = sorted(listOfTopFeatures,
                                   key=lambda x: x[1],
                                   reverse=True)

        for feature, frequency in listOfTopFeatures:
            fp.write(
                str(biomarkerNames[feature]) + "," +
                str(float(frequency / numberOfFolds)) + "\n")
    # average over the 8 classifiers that are active in classifierList
    globalAccuracy = globalAccuracy / 8
    return globalAccuracy
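relativeFeatureImportance and loadDataset are external helpers that are not shown here. A rough sketch of the former, assuming it ranks features by feature_importances_ or by the absolute value of coef_, could be:

def relativeFeatureImportance(classifier):
    # Hypothetical helper: return (importance, feature_index) pairs, most important first.
    if hasattr(classifier, "feature_importances_"):
        importances = classifier.feature_importances_
    elif hasattr(classifier, "coef_"):
        importances = np.sum(np.abs(classifier.coef_), axis=0)
    else:
        raise ValueError("classifier exposes neither feature_importances_ nor coef_")
    order = np.argsort(importances)[::-1]
    return [(importances[i], i) for i in order]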
Example #25
        ax.set_title('Confusion Matrix - Max Depth {} Bag Size {}'.format(
            d, k))
    # plt.show()
    print("Bagging Custom Implementation Completed")

    # # Part c.1 Bagging: Sci-Kit Implementation
    for index, i in enumerate(COMBINATION):
        d, k = i
        temp = {
            'Algorithm': "Bagging",
            'Depth': d,
            'Implementation': 'Sci-Kit',
            'Bag Size': k,
            'Accuracy': 0
        }
        clf = BaggingClassifier(DecisionTreeClassifier(max_depth=d),
                                n_estimators=k)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        temp['Accuracy'] = accuracy_score(y_test, y_pred)
        cf_matrix = confusion_matrix(y_test, y_pred)
        comparision_df = comparision_df.append(temp, ignore_index=True)
        fig, ax = plot_confusion_matrix(conf_mat=cf_matrix, figsize=(5, 8))
        ax.set_title('Confusion Matrix - Max Depth {} Bag Size {}'.format(
            d, k))
    # plt.show()
    print("Bagging Sci-kit Implementation Completed")

    MAX_TREE_DEPTH = [1, 2]
    BAG_SIZE = [20, 40]
    COMBINATION = [(i, j) for i in MAX_TREE_DEPTH for j in BAG_SIZE]
    def score(self, X, Y):
        return np.sum(self.predict(X) == Y) / float(len(Y))


if __name__ == '__main__':
    from sklearn.datasets import make_classification
    from sklearn.ensemble import BaggingClassifier
    X, Y = make_classification(n_samples=500, n_features=20, n_informative=4)
    X_train = X[:300, :]
    Y_train = Y[:300]
    X_test = X[300:, :]
    Y_test = Y[300:]

    max_depth = 5
    rho = 0.2
    nb_epoch = 2
    n_iter = 5
    n_estimators = 1000

    gae = Generate_Adversarial_Ensemble(max_depth, rho, n_iter, nb_epoch,
                                        n_estimators)
    gae.fit(X_train, Y_train)
    print(gae.predict_proba(X_test))
    print(gae.predict(X_test))
    print(gae.score(X_test, Y_test))

    #Bagging
    bc = BaggingClassifier(n_estimators=1000)
    bc.fit(X_train, Y_train)
    print(bc.score(X_test, Y_test))
Example #27
accuracy_score(y1_test, y_pred_rf)

print(classification_report(y1_test, y_pred_rf))

x = cancer_data1.drop('diagnosis', axis=1)
y = cancer_data1['diagnosis']

pd.DataFrame(rf.feature_importances_, index=x.columns)

plt.bar(range(len(rf.feature_importances_)), rf.feature_importances_)
plt.show()
"""### MODELLING_Bagging Classifier"""

from sklearn.ensemble import BaggingClassifier

bc = BaggingClassifier(n_estimators=350, base_estimator=dt, random_state=60)

model_bc = bc.fit(x1_train, y1_train)

y_pred_bc = model_bc.predict(x1_test)

confusion_matrix(y1_test, y_pred_bc)

accuracy_score(y1_test, y_pred_bc)

print(classification_report(y1_test, y_pred_bc))
"""## MODELLING_AdaBoostClassifier"""

from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(base_estimator=dt, n_estimators=350, random_state=60)
Example #28
print(trainData.head())
print(trainTargets.head())
print(np.asarray(trainTargets['Class']))
print(testData.head())

# Initialize the DSBox Encoder

enc = Encoder()
enc.set_training_data(inputs=trainData)
enc.fit()

print(type(enc.get_params()))
print(enc.get_params())

imputer = Imputer()
model = BaggingClassifier()

print(trainData.columns)

encodedTrainData = enc.produce(inputs=trainData)
processedTrainData = imputer.fit_transform(encodedTrainData)
trainedModel = model.fit(processedTrainData, np.asarray(trainTargets['Class']))

print(encodedTrainData.columns)

predictedTargets = trainedModel.predict(
    imputer.fit_transform(enc.produce(inputs=testData)))
print(predictedTargets)

# Outputs the predicted targets in the location specified in the JSON configuration file
with open(jsonCall['output_file'], 'w') as outputFile:
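    # Plausible completion (the body of this block is not shown in the snippet):
    # write one predicted target per line to the configured output file.
    outputFile.write("\n".join(str(p) for p in predictedTargets))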
plt.show()

svmRoc = roc_auc[i]
svmFPR = fpr[i]
svmTPR = tpr[i]

#bagging
from sklearn.ensemble import BaggingClassifier

parameters = {
    'n_estimators': list(range(20, 40, 5)),
    'max_samples': np.linspace(0.1, 1.0, 10),
    'max_features': np.linspace(0.1, 1.0, 10)
}
#'tol':[1e-3,1e-4,1e-5],'C':np.linspace(0.1,1.0,50).round(2),'fit_intercept':(True,False),'max_iter':[20,100,1000]}
bag = BaggingClassifier()
bagGSCV = GridSearchCV(bag, parameters, scoring='precision_macro', cv=5)
bagGSCV.fit(X_train, Y_train)

print("Best parameters set found on development set:")
print()
print(bagGSCV.best_params_)
print()

sorted(bagGSCV.cv_results_.keys())
tempDict = bagGSCV.best_params_
bagParam = ', '.join('%s = %s' % (k, tempDict[k]) for k in tempDict.keys())

y_true, y_pred = Y_validation, bagGSCV.predict(X_validation)
temp = metrics.precision_recall_fscore_support(y_true, y_pred)
bagPrec = mean(temp[0])
Example #30
"""Edit features_path here"""
features_path = '../features/data_lda.pkl'
"""Edit clf_name and clfs_dict here"""
clf_name = 'xgb'
base_clf = LinearSVC()

clfs = {
    'lg':
    LogisticRegression(),
    'svm':
    LinearSVC(),
    'bagging':
    BaggingClassifier(base_estimator=base_clf,
                      n_estimators=60,
                      max_samples=1.0,
                      max_features=1.0,
                      random_state=1,
                      n_jobs=1,
                      verbose=1),
    'rf':
    RandomForestClassifier(n_estimators=10, criterion='gini'),
    'adaboost':
    AdaBoostClassifier(base_estimator=base_clf, n_estimators=50),
    'gbdt':
    GradientBoostingClassifier(),
    'xgb':
    xgb.XGBClassifier(max_depth=6,
                      learning_rate=0.1,
                      n_estimators=100,
                      silent=True,
                      objective='multi:softmax',