Example #1
0
def test_folding_classifier():
    """Check FoldingClassifier behaviour over AdaBoost and SVC bases.

    The expected-capability flags passed to ``check_folding`` differ per
    base estimator (SVC has no staged predictions / feature importances).
    """
    ada_folding = FoldingClassifier(SklearnClassifier(AdaBoostClassifier()),
                                    n_folds=2)
    check_folding(ada_folding, True, True, True)

    svm_folding = FoldingClassifier(SklearnClassifier(SVC()), n_folds=4)
    check_folding(svm_folding, True, False, False)
Example #2
0
def test_folding_classifier():
    """Check FoldingClassifier over AdaBoost and LogisticRegression bases.

    Capability flags passed to ``check_folding`` reflect what each wrapped
    estimator supports (staged predictions, importances, ...).
    """
    ada_folding = FoldingClassifier(SklearnClassifier(AdaBoostClassifier()),
                                    n_folds=2)
    check_folding(ada_folding, True, True, True)

    logreg_folding = FoldingClassifier(
        SklearnClassifier(LogisticRegression()), n_folds=4)
    check_folding(logreg_folding, True, False, False, False)
Example #3
0
def test_complex_stacking_xgboost():
    """AdaBoost stacked on top of a k-fold-wrapped XGBoost classifier."""
    folded_xgb = FoldingClassifier(base_estimator=XGBoostClassifier())
    stacked = SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=folded_xgb, n_estimators=3))
    # The stacked model exposes neither staged predictions nor importances.
    check_classifier(stacked, has_staged_pp=False, has_importances=False)
Example #4
0
def test_complex_stacking_tmva():
    """AdaBoost stacked on top of a k-fold-wrapped TMVA classifier."""
    folded_tmva = FoldingClassifier(base_estimator=TMVAClassifier(),
                                    random_state=13)
    stacked = SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=folded_tmva, n_estimators=3))
    # The stacked model exposes neither staged predictions nor importances.
    check_classifier(stacked, has_staged_pp=False, has_importances=False)
Example #5
0
def test_complex_stacking_mn():
    """AdaBoost stacked on top of a k-fold-wrapped MatrixNet classifier."""
    folded_mn = FoldingClassifier(
        base_estimator=MatrixNetClassifier(iterations=30))
    stacked = SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=folded_mn, n_estimators=3))
    # The stacked model exposes neither staged predictions nor importances.
    check_classifier(stacked, has_staged_pp=False, has_importances=False)
    def fit_one(self, data, model_y, model_stereo):
        """Reconstruct tracks for one dataset and return the efficiency.

        Fixes over the original: ``is not None`` instead of ``!= None``
        (PEP 8); the three separate ``train_size`` checks are merged; each
        multiprocessing ``Pool`` is now closed and joined instead of leaked.

        Parameters
        ----------
        data : table-like with an ``EventID`` column (hit data).
        model_y, model_stereo : track models forwarded to the
            ``tracks_reconstruction`` workers.

        Returns
        -------
        float
            Mean per-event efficiency over the test events, in percent.
        """
        event_ids = numpy.unique(data.EventID.values)

        # Optionally hold out a training subset of events.
        if self.train_size is not None:
            event_ids_train, event_ids_test = train_test_split(
                event_ids, train_size=self.train_size, random_state=42)
        else:
            event_ids_test = event_ids

        clf = None
        if self.train_size is not None:
            # fit train tracks: parallel per-event reconstruction
            pool = Pool(self.processes)
            try:
                results_train = pool.map(
                    tracks_reconstruction,
                    zip(event_ids_train,
                        [data] * len(event_ids_train),
                        [model_y] * len(event_ids_train),
                        [model_stereo] * len(event_ids_train)))
            finally:
                pool.close()
                pool.join()
            tracks_train = merge_dicts(results_train)

            # train clf on the reconstructed training tracks
            sc = SuperCombinator()
            combination_data = sc.data_collection(tracks_train, data)

            X_data = combination_data[combination_data.columns[:-1]].values
            y_data = combination_data.label.values

            xgb_base = XGBoostClassifier(n_estimators=1000, colsample=0.7,
                                         eta=0.01, nthreads=1,
                                         subsample=0.7, max_depth=8)
            folding = FoldingClassifier(xgb_base, n_folds=10, random_state=11)
            folding.fit(X_data, y_data)

            # keep only the first fold's estimator, as in the original code
            clf = folding.estimators[0]

        # fit test tracks
        pool = Pool(self.processes)
        try:
            results_test = pool.map(
                tracks_reconstruction,
                zip(event_ids_test,
                    [data] * len(event_ids_test),
                    [model_y] * len(event_ids_test),
                    [model_stereo] * len(event_ids_test)))
        finally:
            pool.close()
            pool.join()
        tracks_test = merge_dicts(results_test)

        # quality: average per-event efficiency, expressed in percent
        pool = Pool(self.processes)
        try:
            effs = pool.map(
                get_eff_value,
                zip(event_ids_test,
                    [data] * len(event_ids_test),
                    [tracks_test] * len(event_ids_test),
                    [clf] * len(event_ids_test)))
        finally:
            pool.close()
            pool.join()

        eff = 100. * numpy.array(effs).sum() / len(effs)

        return eff
Example #7
0
    # NOTE(review): fragment of a larger function (its `def` is outside this
    # view); X/y/w, X_test/y_test/w_test, branch_names, folding and plt come
    # from the enclosing scope — confirm against the full file.
    lds = LabeledDataStorage(X_test, y_test, w_test)
    # CLASSIFIER
    # Stacking layer: a large random forest (5000 trees, no bootstrap).
    clf_stacking = SklearnClassifier(RandomForestClassifier(n_estimators=5000, bootstrap=False,
                                                            n_jobs=7))
    # clf_stacking = XGBoostClassifier(n_estimators=700, eta=0.1, nthreads=8,
    #                                 subsample=0.5
    #                                 )
    # clf_stacking='nn'
    # Mayou ensemble with the forest as its stacking classifier; the
    # commented alternatives below look like earlier experiments kept for
    # reference.
    clf = Mayou(base_estimators={'xgb': None}, bagging_base=None, bagging_stack=8,
                stacking=clf_stacking, features_stack=branch_names,
                transform=False, transform_pred=False)
    # clf = SklearnClassifier(GaussianNB())
    # clf = SklearnClassifier(BaggingClassifier(n_jobs=1, max_features=1.,
    # bootstrap=False, base_estimator=clf, n_estimators=20, max_samples=0.1))
    # clf = XGBoostClassifier(n_estimators=400, eta=0.1, nthreads=6)
    # clf = SklearnClassifier(BaggingClassifier(clf, max_samples=0.8))
    # clf = SklearnClassifier(NuSVC(cache_size=1000000))
    # clf = SklearnClassifier(clf)
    # With folding enabled, train and test become the same full dataset and
    # the classifier is wrapped in a 5-fold FoldingClassifier (out-of-fold
    # predictions avoid leakage despite train == test).
    if folding:
        X_train = X_test = X
        y_train = y_test = y
        w_train = w_test = w
        clf = FoldingClassifier(clf, n_folds=5)

    clf.fit(X_train, y_train, w_train)


#    report.features_correlation_matrix().plot(new_plot=True)

    plt.show()
Example #8
0
def train_one_vs_one(base_estimators,
                     data_b,
                     data_c,
                     data_light,
                     prefix='bdt',
                     n_folds=2,
                     folding=True,
                     features=None,
                     profile=None):
    """Train three one-vs-one classifiers (b/c, c/light, b/light).

    Each classifier is either wrapped in a FoldingClassifier (when
    ``folding`` is true) or used directly with ``features`` attached.
    Returns a DataFrame of per-sample probabilities, one column per pair,
    aligned to the order b, c, light.
    """
    pairs = [(data_b, data_c), (data_c, data_light), (data_b, data_light)]
    storages = [
        LabeledDataStorage(pandas.concat([pos, neg]),
                           [1] * len(pos) + [0] * len(neg))
        for pos, neg in pairs
    ]

    if folding:
        classifiers = [
            FoldingClassifier(base_estimators[i],
                              n_folds=n_folds,
                              random_state=11,
                              parallel_profile=profile,
                              features=features)
            for i in range(3)
        ]
    else:
        classifiers = [base_estimators[i] for i in range(3)]
        for estimator in classifiers:
            estimator.features = features

    # Fit in the same order as the storages were built: b/c, c/light, b/light.
    for estimator, lds in zip(classifiers, storages):
        estimator.fit_lds(lds)

    clf_b_c, clf_c_light, clf_b_light = classifiers

    # b-vs-c scores: out-of-fold on (b, c), plain prediction on light.
    probs_b_c = numpy.concatenate([
        clf_b_c.predict_proba(pandas.concat([data_b, data_c])),
        clf_b_c.predict_proba(data_light),
    ])[:, 1]

    # c-vs-light scores: plain prediction on b, out-of-fold on (c, light).
    probs_c_light = numpy.concatenate([
        clf_c_light.predict_proba(data_b),
        clf_c_light.predict_proba(pandas.concat([data_c, data_light])),
    ])[:, 1]

    # b-vs-light scores: splice the plain c predictions between the
    # out-of-fold b part and the out-of-fold light part.
    b_light_scores = clf_b_light.predict_proba(
        pandas.concat([data_b, data_light]))[:, 1]
    probs_b_light = numpy.concatenate([
        b_light_scores[:len(data_b)],
        clf_b_light.predict_proba(data_c)[:, 1],
        b_light_scores[len(data_b):],
    ])

    return pandas.DataFrame({
        prefix + '_b_c': probs_b_c,
        prefix + '_b_light': probs_b_light,
        prefix + '_c_light': probs_c_light,
    })
Example #9
0
def test_complex_stacking_tmva():
    """AdaBoost stacked on top of a k-fold-wrapped TMVA BDT."""
    tmva_base = TMVAClassifier(
        factory_options="Silent=True:V=False:DrawProgressBar=False",
        method='kBDT', NTrees=10)
    folded_tmva = FoldingClassifier(base_estimator=tmva_base, random_state=13)
    stacked = SklearnClassifier(
        clf=AdaBoostClassifier(base_estimator=folded_tmva, n_estimators=3))
    # The stacked model exposes neither staged predictions nor importances.
    check_classifier(stacked, has_staged_pp=False, has_importances=False)
    def fit_one(self, data, model_y, model_stereo):
        """Reconstruct tracks for one dataset and return the efficiency.

        Fixes over the original: ``is not None`` instead of ``!= None``
        (PEP 8); the three separate ``train_size`` checks are merged; each
        multiprocessing ``Pool`` is now closed and joined instead of leaked.

        Parameters
        ----------
        data : table-like with an ``EventID`` column (hit data).
        model_y, model_stereo : track models forwarded to the
            ``tracks_reconstruction`` workers.

        Returns
        -------
        float
            Mean per-event efficiency over the test events, in percent.
        """
        event_ids = numpy.unique(data.EventID.values)

        # Optionally hold out a training subset of events.
        if self.train_size is not None:
            event_ids_train, event_ids_test = train_test_split(
                event_ids, train_size=self.train_size, random_state=42)
        else:
            event_ids_test = event_ids

        clf = None
        if self.train_size is not None:
            # fit train tracks: parallel per-event reconstruction
            pool = Pool(self.processes)
            try:
                results_train = pool.map(
                    tracks_reconstruction,
                    zip(event_ids_train, [data] * len(event_ids_train),
                        [model_y] * len(event_ids_train),
                        [model_stereo] * len(event_ids_train)))
            finally:
                pool.close()
                pool.join()
            tracks_train = merge_dicts(results_train)

            # train clf on the reconstructed training tracks
            sc = SuperCombinator()
            combination_data = sc.data_collection(tracks_train, data)

            X_data = combination_data[combination_data.columns[:-1]].values
            y_data = combination_data.label.values

            xgb_base = XGBoostClassifier(n_estimators=1000,
                                         colsample=0.7,
                                         eta=0.01,
                                         nthreads=1,
                                         subsample=0.7,
                                         max_depth=8)
            folding = FoldingClassifier(xgb_base, n_folds=10, random_state=11)
            folding.fit(X_data, y_data)

            # keep only the first fold's estimator, as in the original code
            clf = folding.estimators[0]

        # fit test tracks
        pool = Pool(self.processes)
        try:
            results_test = pool.map(
                tracks_reconstruction,
                zip(event_ids_test, [data] * len(event_ids_test),
                    [model_y] * len(event_ids_test),
                    [model_stereo] * len(event_ids_test)))
        finally:
            pool.close()
            pool.join()
        tracks_test = merge_dicts(results_test)

        # quality: average per-event efficiency, expressed in percent
        pool = Pool(self.processes)
        try:
            effs = pool.map(
                get_eff_value,
                zip(event_ids_test, [data] * len(event_ids_test),
                    [tracks_test] * len(event_ids_test),
                    [clf] * len(event_ids_test)))
        finally:
            pool.close()
            pool.join()

        eff = 100. * numpy.array(effs).sum() / len(effs)

        return eff
Example #11
0
def train_one_vs_one(base_estimators, data_b, data_c, data_light,
                     prefix='bdt', n_folds=2, folding=True, features=None, profile=None):
    """Train three one-vs-one classifiers (b/c, c/light, b/light).

    Each base estimator is either wrapped in a FoldingClassifier (when
    ``folding`` is true) or used as-is with ``features`` attached.
    Returns a DataFrame of per-sample probabilities, one column per pair,
    aligned to the order b, c, light.
    """
    def make_lds(positive, negative):
        # Positive class labelled 1, negative class labelled 0.
        return LabeledDataStorage(pandas.concat([positive, negative]),
                                  [1] * len(positive) + [0] * len(negative))

    data_b_c_lds = make_lds(data_b, data_c)
    data_c_light_lds = make_lds(data_c, data_light)
    data_b_light_lds = make_lds(data_b, data_light)

    if folding:
        def wrap(estimator):
            return FoldingClassifier(estimator, n_folds=n_folds,
                                     random_state=11,
                                     parallel_profile=profile,
                                     features=features)

        tt_folding_b_c = wrap(base_estimators[0])
        tt_folding_c_light = wrap(base_estimators[1])
        tt_folding_b_light = wrap(base_estimators[2])
    else:
        tt_folding_b_c = base_estimators[0]
        tt_folding_b_c.features = features
        tt_folding_c_light = base_estimators[1]
        tt_folding_c_light.features = features
        tt_folding_b_light = base_estimators[2]
        tt_folding_b_light.features = features

    tt_folding_b_c.fit_lds(data_b_c_lds)
    tt_folding_c_light.fit_lds(data_c_light_lds)
    tt_folding_b_light.fit_lds(data_b_light_lds)

    # b-vs-c: out-of-fold on (b, c), plain prediction on light.
    probs_b_c = numpy.concatenate([
        tt_folding_b_c.predict_proba(pandas.concat([data_b, data_c])),
        tt_folding_b_c.predict_proba(data_light),
    ])[:, 1]
    # c-vs-light: plain prediction on b, out-of-fold on (c, light).
    probs_c_light = numpy.concatenate([
        tt_folding_c_light.predict_proba(data_b),
        tt_folding_c_light.predict_proba(pandas.concat([data_c, data_light])),
    ])[:, 1]
    # b-vs-light: splice the plain c predictions between the out-of-fold
    # b part and the out-of-fold light part.
    b_light_scores = tt_folding_b_light.predict_proba(
        pandas.concat([data_b, data_light]))[:, 1]
    probs_b_light = numpy.concatenate([
        b_light_scores[:len(data_b)],
        tt_folding_b_light.predict_proba(data_c)[:, 1],
        b_light_scores[len(data_b):],
    ])

    return pandas.DataFrame({prefix + '_b_c': probs_b_c,
                             prefix + '_b_light': probs_b_light,
                             prefix + '_c_light': probs_c_light})