Example #1
# Imports as used in the REP test suite (module paths are an assumption here;
# the later examples reuse the same names):
import numpy
from sklearn.utils import check_random_state
from rep.estimators import XGBoostClassifier
from rep.test.test_estimators import generate_classification_data


def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [
            145, None,
            check_random_state(None),
            check_random_state(145)
    ]:
        clf1 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            assert not numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            assert numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
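The asymmetry in the assertions above comes from how scikit-learn's check_random_state behaves: an int seed builds a fresh, reproducible generator on every call, while a RandomState instance is returned as-is and its state advances with every draw. A minimal sketch, assuming only numpy and scikit-learn:

import numpy
from sklearn.utils import check_random_state

# The same int seed yields a fresh generator each call: reproducible draws.
a = check_random_state(145).rand(3)
b = check_random_state(145).rand(3)
assert numpy.allclose(a, b)

# A RandomState instance is passed through unchanged, so two consecutive
# uses of the same instance consume its state and give different numbers.
rs = check_random_state(check_random_state(145))
assert not numpy.allclose(rs.rand(3), rs.rand(3))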
Example #2
def test_xgboost_feature_importance():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    clf = XGBoostClassifier(n_estimators=1, max_depth=1)
    clf.fit(X, y)
    importances = clf.get_feature_importances()
    original_features = set(X.columns)
    importances_features = set(importances.index)
    print(original_features, importances_features)
    assert original_features == importances_features, 'feature_importances_ returned wrong features'

    assert len(original_features) == len(clf.feature_importances_)
Example #3
def test_feature_importances():
    clf = XGBoostClassifier()
    X, y, sample_weight = generate_classification_data()
    clf.fit(X, y, sample_weight=sample_weight)
    # checking feature importance (three ways)

    res_default = clf.xgboost_classifier.get_fscore()
    res2 = clf._get_fscore()
    res3 = clf.feature_importances_

    assert res_default == res2, res_default
    for i, val in enumerate(res3):
        if val > 0.0:
            assert val == res_default['f' + str(i)]
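The `val > 0.0` guard above exists because xgboost's get_fscore() only lists features that were actually used in at least one split, keyed 'f0', 'f1', ... in input column order. A short sketch against the plain xgboost package (data and parameters are illustrative):

import numpy
import xgboost

X = numpy.random.rand(200, 3)
y = (X[:, 0] > 0.5).astype(int)
booster = xgboost.train({'max_depth': 2, 'objective': 'binary:logistic'},
                        xgboost.DMatrix(X, label=y), num_boost_round=5)
# Keys follow input column order; features never split on are absent.
print(booster.get_fscore())   # e.g. {'f0': 9}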
Example #4
def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        # smoke test: training and prediction must not raise for this dtype
        clf.fit(X.astype(dtype=dtype), y.astype(dtype=dtype),
                sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # testing single pandas.DataFrame with different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    import pandas
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
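The second half of the test relies on pandas letting each column of a single DataFrame carry its own dtype. A self-contained illustration (numpy and pandas only):

import numpy
import pandas

df = pandas.DataFrame({
    dt: numpy.random.normal(0, 10, size=5).astype(dt)
    for dt in ['float32', 'float64', 'int32', 'int64', 'uint32']
})
print(df.dtypes)   # one dtype per column, matching the column names here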
Example #5
if ii == 1:
    train = trainFeaturesObvious
    Var = 'Mass'
if ii == 2:
    train = trainFeaturesHH
    Var = 'HH'
# NOTE: the ii == 0 branch (the full feature list and its Var label) is set
# earlier in the original script and is not part of this excerpt.
xgb = XGBoostClassifier(train)
# Hyperparameters tried previously and left disabled in the original:
# xgb = XGBoostClassifier(train, n_estimators=200, eta=0.1, max_depth=7,
#                         subsample=0.9, colsample=0.6)
# original = xgboriginal.XGBClassifier(train)  # unused here; the plain-xgboost comparison is kept commented out below
# numpy.bool was removed in NumPy 1.24; plain bool works everywhere.
xgb.fit(traindatasetmix[train].astype(np.float64),
        traindatasetmix.target.astype(bool),
        sample_weight=traindatasetmix[weights].astype(np.float64))
prob = xgb.predict_proba(valdatasetmix[train].astype(np.float64))
if ii == 0: reportAll = xgb.test_on(traindatasetmix[trainFeaturesplot].astype(np.float64), traindatasetmix.target.astype(bool))
if ii == 1: reportObvious = xgb.test_on(traindatasetmix[trainFeaturesObvious].astype(np.float64), traindatasetmix.target.astype(bool))
if ii == 2: reportHH = xgb.test_on(traindatasetmix[trainFeaturesHH].astype(np.float64), traindatasetmix.target.astype(bool))
# compatible with lustre/lxplus
#features = ['costhst_DiJets[0]_HH', 'costhst_Jets[0]_DiJets[0]', 'costhst_Jets[2]_DiJets[1]', 'CSV3', 'CSV4', 'Jets[0].eta()', 'Jets[1].eta()', 'Jets[2].eta()', 'Jets[3].eta()', 'HT_other_jets']
#dataout = traindatasetmix.rename(index=str, columns={'HHCost':'costhst_DiJets[0]_HH', 'H1Costbb':'costhst_Jets[0]_DiJets[0]', 'H2Costbb':'costhst_Jets[2]_DiJets[1]', 'CSV3':'CSV3', 'CSV4':'CSV4', 'jeteta1':'Jets[0].eta()', 'jeteta2':'Jets[1].eta()', 'jeteta3':'Jets[2].eta()', 'jeteta4':'Jets[3].eta()', 'jetHTrest':'HT_other_jets'})
#param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'}
#num_round = 2
#original = xgboriginal.XGBClassifier(param, train, num_round).fit(traindatasetmix[train].astype(np.float64), traindatasetmix.target.astype(bool), sample_weight=(traindatasetmix[weights].astype(np.float64)))
#proboriginal = original.predict_proba(valdatasetmix[train].astype(np.float64))
#print(proboriginal)
#joblib.dump(original, outputCentral+"_"+Var+'.pkl')
joblib.dump(prob, outputCentral + "_" + Var + '.pkl')
#pickle.dump(prob, outputCentral+"_"+Var+'.pkl')
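The live joblib.dump call above persists the predicted probabilities rather than the fitted model. A minimal round-trip sketch (the file name is illustrative):

import numpy
import joblib

prob = numpy.random.rand(10, 2)   # stand-in for predict_proba output
joblib.dump(prob, 'prob_Mass.pkl')
assert numpy.allclose(prob, joblib.load('prob_Mass.pkl'))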