Example #1
def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
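    # per the assertions below: an int seed and None give reproducible fits,
    # while passing a RandomState instance is expected to change the result between fits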
    for random_state in [145, None, check_random_state(None), check_random_state(145)]:
        clf1 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(clf1.predict_proba(X), clf2.predict_proba(X)), 'seed: {}'.format(random_state)
Example #2
def test_xgboost_feature_importance():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)
    importances = clf.get_feature_importances()
    original_features = set(X.columns)
    importances_features = set(importances.index)
    print(original_features, importances_features)
    assert original_features == importances_features, 'feature_importances_ returned something wrong'

    assert len(original_features) == len(clf.feature_importances_)
Example #4
def test_feature_importances():
    clf = XGBoostClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y, sample_weight=sample_weight)
    # checking feature importance (three ways)

    res_default = clf.xgboost_classifier.get_fscore()
    res2 = clf._get_fscore()
    res3 = clf.feature_importances_

    assert res_default == res2, res_default
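    # get_fscore() only lists features that were actually used in splits ('f0', 'f1', ...),
    # hence the val > 0 guard before comparing against feature_importances_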
    for i, val in enumerate(res3):
        if val > 0.0:
            assert val == res_default['f' + str(i)]
Example #5
def test_complex_stacking_xgboost():
    # Ada over kFold over xgboost
    base_kfold = FoldingClassifier(base_estimator=XGBoostClassifier())
    check_classifier(SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=base_kfold, n_estimators=3)),
                     has_staged_pp=False,
                     has_importances=False)
Example #6
def test_basic_xgboost():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
Example #7
def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
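    # fitting and predicting should work for every common numeric dtype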
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype=dtype),
                y.astype(dtype=dtype),
                sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # testing single pandas.DataFrame with different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
Example #8
def very_basic_xgboost_test():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # testing that returned features in importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
Example #9
def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype=dtype), y.astype(dtype=dtype), sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # testing single pandas.DataFrame with different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
Example #10
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter

    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
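    # mark each row with a random group id and shift groups 1 and 2 so the groups are well separated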
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4
    X.loc[splitters == 2, :] -= 4
    fs = FeatureSplitter(base_estimator=XGBoostClassifier(features=list(
        X.columns[1:]),
                                                          n_estimators=10,
                                                          max_depth=3),
                         split_feature=split_column)
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
Example #11
    def _make_clf(self, clf, bagging=None):
        """Creates a classifier from a dict or returns the clf"""
        if isinstance(clf, dict):
            key, val = clf.popitem()
            try:
                val = self.__DEFAULT_CLF_CFG[key] if val is None else val
            except KeyError:
                logger.error(str(key) + " not an implemented classifier.")
                raise

            temp_bagging = val.pop('bagging', bagging)
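            # an explicit `bagging` argument takes precedence over a 'bagging' entry in the config dict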
            bagging = temp_bagging if bagging is None else bagging

            if key == 'rdf':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(RandomForestClassifier(**config_clf))
            elif key == 'erf':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(ExtraTreesClassifier(**config_clf))
            elif key == 'nn':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = TheanetsClassifier(**config_clf)
            elif key == 'ada':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(AdaBoostClassifier(**config_clf))
            elif key == 'gb':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = SklearnClassifier(GradientBoostingClassifier(**config_clf))
            elif key == 'xgb':
                config_clf = dict(val)  # possible multi-threading arguments
                clf = XGBoostClassifier(**config_clf)
            elif hasattr(clf, 'fit'):
                bagging = False  # return the classifier

            # bagging over the instantiated estimators
            if isinstance(bagging, int) and bagging >= 1:
                bagging = dict(self.__DEFAULT_BAG_CFG, n_estimators=bagging)
            if isinstance(bagging, dict):
                # TODO: implement multi-thread:
                bagging.update({'base_estimator': clf})
                clf = SklearnClassifier(BaggingClassifier(**bagging))
        else:
            raise ValueError(str(clf) + " not valid as a classifier.")

        clf = {key: clf}
        return clf
Example #12
def test_feature_splitter():
    # testing splitter
    from rep.metaml import FeatureSplitter

    X, y, sample_weight = generate_classification_data(n_classes=3)
    split_column = X.columns[0]
    splitters = numpy.random.randint(0, 3, size=len(X))
    X[split_column] = splitters
    X.loc[splitters == 1, :] += 4
    X.loc[splitters == 2, :] -= 4
    fs = FeatureSplitter(base_estimator=XGBoostClassifier(n_estimators=10, max_depth=3),
                         split_feature=split_column, train_features=list(X.columns[1:]))
    fs.fit(X, y, sample_weight=sample_weight)
    assert fs.score(X, y) > 0.9
    p_final = fs.predict_proba(X)
    for p in fs.staged_predict_proba(X):
        pass
    assert numpy.allclose(p_final, p), 'end of iterations differs from expected'
Example #13
def main(job_id, params):
    print "Anything printed here will end up in the output directory for job ", job_id
    print params

    if job_id > 50: file = open("optimisation_done_flag", "a").close()

    comp_file_list = [(
        "/Users/weisser/MIT_Dropbox/MIT/Research/learningml/learningml/GoF/data/accept_reject/sin1diff_data/data_sin1diff_5_and_5_periods4D_sample_optimisation_0.txt",
        "/Users/weisser/MIT_Dropbox/MIT/Research/learningml/learningml/GoF/data/accept_reject/sin1diff_data/data_sin1diff_5_and_6_periods4D_sample_optimisation_0.txt"
    )]
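    # eta and n_estimators are taken from the Spearmint search point passed in via `params`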

    clf = XGBoostClassifier(base_score=0.5,
                            colsample=1.0,
                            eta=params['eta'],
                            features=None,
                            gamma=None,
                            max_depth=6,
                            min_child_weight=1.0,
                            missing=-999.0,
                            n_estimators=params['n_estimators'],
                            nthreads=16,
                            num_feature=None,
                            random_state=0,
                            scale_pos_weight=1.0,
                            subsample=1.0,
                            verbose=0)

    result = classifier_eval.classifier_eval(
        name="xgb_4Dsin_5_6_CPV_syst_0_01_",
        title="xgb Sin 5 6 periods syst0.01",
        comp_file_list=comp_file_list,
        clf=clf,
        mode="spearmint_optimisation",
        scoring="chi2",
        no_bins=5,
        systematics_fraction=0.01)

    with open(
            "xgb_4Dsin_5_6_CPV_syst_0_01__chi2scoring_5_optimisation_values.txt",
            "a") as myfile:
        myfile.write(
            str(params["n_estimators"][0]) + "\t" + str(params["eta"][0]) +
            "\t" + str(result) + "\n")
    return result
Example #14
def name_to_nclf(name):
    # This function gives some standard versions of common machine learning classifiers.
    if name == "dt":
        anclf = nclf('dt', tree.DecisionTreeClassifier(),
                     ['max_depth', 'min_samples_split'], [[1, 60], [2, 100]])
    if name == "bdt":
        anclf = nclf(
            'bdt',
            AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(
                max_depth=2)), ['learning_rate', 'n_estimators'],
            [[0.01, 2.0], [100, 1000]])
    if name == "xgb":
        anclf = nclf('xgb', XGBoostClassifier(), ['n_estimators', 'eta'],
                     [[10, 1000], [0.01, 1.0]])
    if name == "svm":
        anclf = nclf('svm', SVC(probability=True, cache_size=7000),
                     ['C', 'gamma'], [[1.0, 1000.0], [1E-6, 0.1]])
    if name == "nn":
        anclf = nclf('nn', "no classifier needed for nn",
                     ['n_hidden_layers', 'dimof_middle'], [[0, 1], [100, 500]])
    return anclf
Example #15
def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [
            145, None,
            check_random_state(None),
            check_random_state(145)
    ]:
        clf1 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
Example #16
    def fit_one(self, data, model_y, model_stereo):
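        # optionally hold out a subset of events for training the track-combination classifier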

        event_ids = numpy.unique(data.EventID.values)

        if self.train_size != None:
            event_ids_train, event_ids_test = train_test_split(
                event_ids, train_size=self.train_size, random_state=42)
        else:
            event_ids_test = event_ids

        # fit train tracks
        if self.train_size != None:

            tracks_train = {}

            p = Pool(self.processes)
            results_train = p.map(
                tracks_reconstruction,
                zip(event_ids_train, [data] * len(event_ids_train),
                    [model_y] * len(event_ids_train),
                    [model_stereo] * len(event_ids_train)))
            tracks_train = merge_dicts(results_train)

        # train clf
        if self.train_size != None:

            sc = SuperCombinator()

            combination_data = sc.data_collection(tracks_train, data)

            X_data = combination_data[combination_data.columns[:-1]].values
            y_data = combination_data.label.values

            xgb_base = XGBoostClassifier(n_estimators=1000,
                                         colsample=0.7,
                                         eta=0.01,
                                         nthreads=1,
                                         subsample=0.7,
                                         max_depth=8)
            folding = FoldingClassifier(xgb_base, n_folds=10, random_state=11)
            folding.fit(X_data, y_data)

            clf = folding.estimators[0]

        else:
            clf = None

        # fit test tracks
        tracks_test = {}

        p = Pool(self.processes)
        results_test = p.map(
            tracks_reconstruction,
            zip(event_ids_test, [data] * len(event_ids_test),
                [model_y] * len(event_ids_test),
                [model_stereo] * len(event_ids_test)))
        tracks_test = merge_dicts(results_test)

        # quality
        p = Pool(self.processes)
        effs = p.map(
            get_eff_value,
            zip(event_ids_test, [data] * len(event_ids_test),
                [tracks_test] * len(event_ids_test),
                [clf] * len(event_ids_test)))

        eff = 100. * numpy.array(effs).sum() / len(effs)

        return eff
Example #17
if typedata=="Data": 
  arr = dataset20.to_records()
  array2root(arr, outputCentral+"_AppliedTo20pOfPlain"+typedata+".root" , 'tree', 'recreate')
#

for ii in range(0,3):
   if ii==0 :
     train= trainFeaturesplot
     Var='All'
   if ii==1 :
     train= trainFeaturesObvious
     Var='Mass'
   if ii==2 :
     train= trainFeaturesHH
     Var='HH'
   xgb = XGBoostClassifier(train) #,
   original = xgboriginal.XGBClassifier(train)
   """
            n_estimators =  200,
            eta = 0.1,
            max_depth = 7,
            subsample = 0.9,
            colsample = 0.6)
   """
   xgb.fit(traindatasetmix[train].astype(np.float64), traindatasetmix.target.astype(np.bool), sample_weight= (traindatasetmix[weights].astype(np.float64))) 
   prob = xgb.predict_proba(valdatasetmix[train].astype(np.float64) )
   if ii==0 : reportAll = xgb.test_on(traindatasetmix[trainFeaturesplot].astype(np.float64), traindatasetmix.target.astype(np.bool))
   if ii==1 : reportObvious = xgb.test_on(traindatasetmix[trainFeaturesObvious].astype(np.float64), traindatasetmix.target.astype(np.bool))
   if ii==2 : reportHH = xgb.test_on(traindatasetmix[trainFeaturesHH].astype(np.float64), traindatasetmix.target.astype(np.bool))
   # compatible with lustr/lxplus
   #features =  ['costhst_DiJets[0]_HH', 'costhst_Jets[0]_DiJets[0]', 'costhst_Jets[2]_DiJets[1]', 'CSV3', 'CSV4', 'Jets[0].eta()', 'Jets[1].eta()', 'Jets[2].eta()', 'Jets[3].eta()', 'HT_other_jets']
Example #18
def test_xgboost():
    check_classifier(XGBoostClassifier(), n_classes=2)
    check_classifier(XGBoostClassifier(), n_classes=4)
    check_regression(XGBoostRegressor())
Example #19
#nclf_list = [nclf('xgb',XGBoostClassifier(),['n_estimators','eta'], [[10,1000],[0.01,1.0]], param_opt=[1000.,0.9738])]
#nclf_list = [nclf('nn',"no classifier needed for nn", ['n_hidden_layers','dimof_middle'], [[0,1],[100,500]],param_opt=[1,210])]
#nclf_list = [name_to_nclf("nn")]

#nclf_list = [name_to_nclf("bdt"), name_to_nclf("xgb"), name_to_nclf("svm"), name_to_nclf("nn")]
#nclf_list = [name_to_nclf("bdt"), name_to_nclf("xgb"),  name_to_nclf("nn")]
#nclf_list = [name_to_nclf("svm")]

nclf_list = [
    nclf('bdt',
         AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(
             max_depth=2)), ['learning_rate', 'n_estimators'],
         [[0.01, 2.0], [1, 1000]],
         param_opt=[0.01, 992]),
    nclf('xgb',
         XGBoostClassifier(), ['n_estimators', 'eta'],
         [[10, 1000], [0.01, 1.0]],
         param_opt=[423, 0.0104]),
    nclf('nn',
         "no classifier needed for nn", ['n_hidden_layers', 'dimof_middle'],
         [[0, 1], [100, 500]],
         param_opt=[1, 210]),
    nclf('svm',
         SVC(probability=True, cache_size=7000), ['C', 'gamma'],
         [[1.0, 1000.0], [1E-6, 0.1]],
         param_opt=[583.3, 0.0012])
]
#nclf_list = [nclf('bdt',AdaBoostClassifier(base_estimator=tree.DecisionTreeClassifier(max_depth=2)), ['learning_rate','n_estimators'], [[0.01,2.0],[1,1000]], param_opt=[0.432, 18]),  nclf('xgb',XGBoostClassifier(), ['n_estimators','eta'], [[10,1000],[0.01,1.0]], param_opt=[619, 0.1489]),  nclf('nn',"no classifier needed for nn", ['n_hidden_layers','dimof_middle'], [[0,1],[100,500]],param_opt=[0,174]), nclf('svm',SVC(probability=True, cache_size=7000), ['C','gamma'], [[1.0,1000.0],[1E-6,0.1]], param_opt=[5.269, 0.00453])]

systematics_fraction = 0.01
Example #20
def test_simple_stacking_xgboost():
    base_xgboost = XGBoostClassifier()
    classifier = SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=base_xgboost, n_estimators=3))
    check_classifier(classifier, has_staged_pp=False)
Example #21
def clf_mayou(data1, data2, n_folds=3, n_base_clf=5):
    """DEVELOPEMENT, WIP. Test a setup of clf involving bagging and stacking."""
    # import raredecay.analysis.ml_analysis as ml_ana
    # import pandas as pd
    import copy

    from rep.estimators import SklearnClassifier, XGBoostClassifier
    from rep.metaml.folding import FoldingClassifier
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
    from sklearn.ensemble import BaggingClassifier  # , VotingClassifier, AdaBoostClassifier
    from rep.estimators.theanets import TheanetsClassifier
    from sklearn.linear_model import LogisticRegression
    from rep.metaml.cache import CacheClassifier

    from rep.report.metrics import RocAuc

    import rep.metaml.cache
    from rep.metaml._cache import CacheHelper
    rep.metaml.cache.cache_helper = CacheHelper('/home/mayou/cache', 100000)
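    # the CacheClassifier wrappers below store and reload fitted estimators via this cache (path is machine specific)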

    #    data1.make_folds(n_folds)
    #    data2.make_folds(n_folds)
    output = {}

    # for i in range(n_folds):
    xgb_clf = XGBoostClassifier(n_estimators=350,
                                eta=0.1,
                                max_depth=4,
                                nthreads=3)
    xgb_folded = FoldingClassifier(base_estimator=xgb_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    xgb_bagged = BaggingClassifier(base_estimator=xgb_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
    xgb_bagged = SklearnClassifier(xgb_bagged)
    xgb_big_stacker = copy.deepcopy(xgb_bagged)
    xgb_bagged = CacheClassifier(name='xgb_bagged1', clf=xgb_bagged)

    xgb_single = XGBoostClassifier(n_estimators=350,
                                   eta=0.1,
                                   max_depth=4,
                                   nthreads=3)
    xgb_single = FoldingClassifier(base_estimator=xgb_single,
                                   stratified=True,
                                   n_folds=10,
                                   parallel_profile='threads-2')
    xgb_single = CacheClassifier(name='xgb_singled1', clf=xgb_single)

    rdf_clf = SklearnClassifier(
        RandomForestClassifier(n_estimators=300, n_jobs=3))
    rdf_folded = FoldingClassifier(base_estimator=rdf_clf,
                                   stratified=True,
                                   parallel_profile='threads-2')
    rdf_bagged = BaggingClassifier(base_estimator=rdf_folded,
                                   n_estimators=n_base_clf,
                                   bootstrap=False)
    rdf_bagged = SklearnClassifier(rdf_bagged)
    rdf_bagged = CacheClassifier(name='rdf_bagged1', clf=rdf_bagged)

    gb_clf = SklearnClassifier(GradientBoostingClassifier(n_estimators=50))
    gb_folded = FoldingClassifier(base_estimator=gb_clf,
                                  stratified=True,
                                  parallel_profile='threads-6')
    gb_bagged = BaggingClassifier(base_estimator=gb_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=5)
    gb_bagged = SklearnClassifier(gb_bagged)
    gb_bagged = CacheClassifier(name='gb_bagged1', clf=gb_bagged)

    nn_clf = TheanetsClassifier(layers=[300, 300],
                                hidden_dropout=0.03,
                                trainers=[{
                                    'optimize': 'adagrad',
                                    'patience': 5,
                                    'learning_rate': 0.2,
                                    'min_improvement': 0.1,
                                    'momentum': 0.4,
                                    'nesterov': True,
                                    'loss': 'xe'
                                }])
    nn_folded = FoldingClassifier(base_estimator=nn_clf,
                                  stratified=True,
                                  parallel_profile=None)  # 'threads-6')
    nn_bagged = BaggingClassifier(base_estimator=nn_folded,
                                  n_estimators=n_base_clf,
                                  bootstrap=False,
                                  n_jobs=1)
    nn_bagged = CacheClassifier(name='nn_bagged1', clf=nn_bagged)

    nn_single_clf = TheanetsClassifier(layers=[300, 300, 300],
                                       hidden_dropout=0.03,
                                       trainers=[{
                                           'optimize': 'adagrad',
                                           'patience': 5,
                                           'learning_rate': 0.2,
                                           'min_improvement': 0.1,
                                           'momentum': 0.4,
                                           'nesterov': True,
                                           'loss': 'xe'
                                       }])
    nn_single = FoldingClassifier(base_estimator=nn_single_clf,
                                  n_folds=3,
                                  stratified=True)
    nn_single = CacheClassifier(name='nn_single1', clf=nn_single)

    logit_stacker = SklearnClassifier(
        LogisticRegression(penalty='l2', solver='sag'))
    logit_stacker = FoldingClassifier(base_estimator=logit_stacker,
                                      n_folds=n_folds,
                                      stratified=True,
                                      parallel_profile='threads-6')
    logit_stacker = CacheClassifier(name='logit_stacker1', clf=logit_stacker)

    xgb_stacker = XGBoostClassifier(n_estimators=400,
                                    eta=0.1,
                                    max_depth=4,
                                    nthreads=8)
    # HACK
    xgb_stacker = xgb_big_stacker
    xgb_stacker = FoldingClassifier(base_estimator=xgb_stacker,
                                    n_folds=n_folds,
                                    random_state=42,
                                    stratified=True,
                                    parallel_profile='threads-6')
    xgb_stacker = CacheClassifier(name='xgb_stacker1', clf=xgb_stacker)

    #        train1, test1 = data1.get_fold(i)
    #        train2, test2 = data1.get_fold(i)
    #
    #        t_data, t_targets, t_weights =
    data, targets, weights = data1.make_dataset(data2, weights_ratio=1)

    #    xgb_bagged.fit(data, targets, weights)
    #    xgb_report = xgb_bagged.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_base classifier")
    #    output['xgb_base'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_bagged, xgb_folded, xgb_clf, xgb_report
    #
    #    xgb_single.fit(data, targets, weights)
    #    xgb_report = xgb_single.test_on(data, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC xgb_single classifier")
    #    output['xgb_single'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    xgb_proba = xgb_report.prediction['clf'][:, 1]
    #    del xgb_single, xgb_report

    nn_single.fit(data, targets, weights)
    nn_report = nn_single.test_on(data, targets, weights)
    nn_report.roc(physics_notion=True).plot(
        new_plot=True, title="ROC AUC nn_single classifier")
    output['nn_single'] = "roc auc:" + str(
        nn_report.compute_metric(metric=RocAuc()))
    # nn_proba = nn_report.prediction['clf'][:, 1]
    del nn_single, nn_report

    #    rdf_bagged.fit(data, targets, weights)
    #    rdf_report = rdf_bagged.test_on(data, targets, weights)
    #    rdf_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC rdf_base classifier")
    #    output['rdf_base'] = "roc auc:" + str(rdf_report.compute_metric(metric=RocAuc()))
    #    rdf_proba = rdf_report.prediction['clf'][:, 1]
    #    del rdf_bagged, rdf_clf, rdf_folded, rdf_report

    #    gb_bagged.fit(data, targets, weights)
    #    gb_report = gb_bagged.test_on(data, targets, weights)
    #    gb_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC gb_base classifier")
    #    output['gb_base'] = "roc auc:" + str(gb_report.compute_metric(metric=RocAuc()))
    #    gb_proba = gb_report.prediction['clf'][:, 1]
    #    del gb_bagged, gb_clf, gb_folded, gb_report

    #    nn_bagged.fit(data, targets, weights)
    #    nn_report = nn_bagged.test_on(data, targets, weights)
    #    nn_report.roc(physics_notion=True).plot(new_plot=True, title="ROC AUC nn_base classifier")
    #    output['nn_base'] = "roc auc:" + str(nn_report.compute_metric(metric=RocAuc()))
    #    nn_proba = nn_report.prediction['clf'][:, 1]
    #    del nn_bagged, nn_clf, nn_folded, nn_report
    #
    #    base_predict = pd.DataFrame({'xgb': xgb_proba,
    #                                 #'rdf': rdf_proba,
    #                                 #'gb': gb_proba,
    #                                 'nn': nn_proba
    #                                 })
    #
    #
    #    xgb_stacker.fit(base_predict, targets, weights)
    #    xgb_report = xgb_stacker.test_on(base_predict, targets, weights)
    #    xgb_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC xgb_stacked classifier")
    #    output['stacker_xgb'] = "roc auc:" + str(xgb_report.compute_metric(metric=RocAuc()))
    #    del xgb_stacker, xgb_report
    #
    #    logit_stacker.fit(base_predict, targets, weights)
    #    logit_report = logit_stacker.test_on(base_predict, targets, weights)
    #    logit_report.roc(physics_notion=True).plot(new_plot=True,
    #    title="ROC AUC logit_stacked classifier")
    #    output['stacker_logit'] = "roc auc:" + str(logit_report.compute_metric(metric=RocAuc()))
    #    del logit_stacker, logit_report

    print(output)
Example #22
def test_xgboost():
    check_classifier(XGBoostClassifier(n_estimators=20), n_classes=2)
    check_classifier(XGBoostClassifier(n_estimators=20), n_classes=4)
    check_regression(XGBoostRegressor(n_estimators=20))