Example #1
# imports assumed from REP (Yandex's Reproducible Experiment Platform);
# generate_classification_data is REP's test helper (exact module path may differ)
import numpy
from rep.estimators import XGBoostClassifier
from rep.test.test_estimators import generate_classification_data

def test_basic_xgboost():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # check that the features listed in the importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
Example #2
# imports as in Example #1; check_random_state comes from scikit-learn
import numpy
from sklearn.utils import check_random_state
from rep.estimators import XGBoostClassifier
from rep.test.test_estimators import generate_classification_data

def test_xgboost_random_states():
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    for random_state in [
            145, None,
            check_random_state(None),
            check_random_state(145)
    ]:
        clf1 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf1.fit(X, y)
        clf2 = XGBoostClassifier(n_estimators=5,
                                 max_depth=1,
                                 subsample=0.1,
                                 random_state=random_state)
        clf2.fit(X, y)
        if isinstance(random_state, numpy.random.RandomState):
            # a shared RandomState instance is consumed by the first fit,
            # so the second fit draws a different seed and the models differ
            assert not numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
        else:
            # an integer seed, or None (which the wrapper evidently maps to a
            # fixed default, since identical results are expected), is reused
            # verbatim, so both fits agree
            assert numpy.allclose(
                clf1.predict_proba(X),
                clf2.predict_proba(X)), 'seed: {}'.format(random_state)
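The split in the assertions above follows from how scikit-learn's check_random_state behaves: the same integer seed always rebuilds an identical, independent generator, while a RandomState instance is passed through and consumed. A minimal sketch using only scikit-learn and numpy:

import numpy
from sklearn.utils import check_random_state

# the same integer seed rebuilds an identical, independent generator
a, b = check_random_state(145), check_random_state(145)
assert numpy.allclose(a.rand(5), b.rand(5))

# a single RandomState instance is consumed by the first batch of draws,
# so the second batch differs
rs = check_random_state(145)
assert not numpy.allclose(rs.rand(5), rs.rand(5))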
Example #3
# same imports as Example #1
def very_basic_xgboost_test():
    X, y, w = generate_classification_data(n_classes=2)
    clf = XGBoostClassifier(n_estimators=10).fit(X, y)
    clf.predict(X)
    clf.predict_proba(X)
    # check that the features listed in the importances are correct and in the same order
    assert numpy.all(clf.features == clf.get_feature_importances().index)
Example #4
# imports assumed from REP, as in the previous examples
import numpy
import pandas
from rep.estimators import XGBoostClassifier
from rep.test.test_estimators import generate_classification_data

def test_xgboost_works_with_different_dtypes():
    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype), y.astype(dtype), sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))

    # test a single pandas.DataFrame whose columns have different dtypes
    X, y, weights = generate_classification_data(n_classes=2, distance=5)
    X = pandas.DataFrame()
    for dtype in dtypes:
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)
    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
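As a sanity check on the mixed-dtype frame built above, plain pandas is enough to confirm each column keeps the dtype it was cast to (standalone sketch, no REP required):

import numpy
import pandas

dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
X = pandas.DataFrame()
for dtype in dtypes:
    X[dtype] = numpy.random.normal(0, 10, size=100).astype(dtype)

# each column retains the dtype it was cast to
assert [str(t) for t in X.dtypes] == dtypes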
Example #5
# imports inferred from usage (the snippet begins mid-script, inside a loop over ii)
import joblib
import numpy as np
import xgboost as xgboriginal
from rep.estimators import XGBoostClassifier

# the branch selecting the first feature set is inferred from the ii == 1 report below
if ii == 1:
    train = trainFeaturesObvious
    Var = 'Mass'
if ii == 2:
    train = trainFeaturesHH
    Var = 'HH'
# REP's XGBoostClassifier takes the feature list as its first argument;
# these hyper-parameters were left commented out in the original:
#     n_estimators=200, eta=0.1, max_depth=7, subsample=0.9, colsample=0.6
xgb = XGBoostClassifier(train)
original = xgboriginal.XGBClassifier(train)
# np.bool is deprecated in modern NumPy; plain bool keeps the original behaviour
xgb.fit(traindatasetmix[train].astype(np.float64),
        traindatasetmix.target.astype(bool),
        sample_weight=traindatasetmix[weights].astype(np.float64))
prob = xgb.predict_proba(valdatasetmix[train].astype(np.float64))
if ii == 0:
    reportAll = xgb.test_on(traindatasetmix[trainFeaturesplot].astype(np.float64),
                            traindatasetmix.target.astype(bool))
if ii == 1:
    reportObvious = xgb.test_on(traindatasetmix[trainFeaturesObvious].astype(np.float64),
                                traindatasetmix.target.astype(bool))
if ii == 2:
    reportHH = xgb.test_on(traindatasetmix[trainFeaturesHH].astype(np.float64),
                           traindatasetmix.target.astype(bool))
# feature names compatible with lustre/lxplus:
# features = ['costhst_DiJets[0]_HH', 'costhst_Jets[0]_DiJets[0]', 'costhst_Jets[2]_DiJets[1]',
#             'CSV3', 'CSV4', 'Jets[0].eta()', 'Jets[1].eta()', 'Jets[2].eta()',
#             'Jets[3].eta()', 'HT_other_jets']
# dataout = traindatasetmix.rename(index=str, columns={
#     'HHCost': 'costhst_DiJets[0]_HH', 'H1Costbb': 'costhst_Jets[0]_DiJets[0]',
#     'H2Costbb': 'costhst_Jets[2]_DiJets[1]', 'CSV3': 'CSV3', 'CSV4': 'CSV4',
#     'jeteta1': 'Jets[0].eta()', 'jeteta2': 'Jets[1].eta()', 'jeteta3': 'Jets[2].eta()',
#     'jeteta4': 'Jets[3].eta()', 'jetHTrest': 'HT_other_jets'})
# commented-out comparison against native xgboost:
# param = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'}
# num_round = 2
# original = xgboriginal.XGBClassifier(param, train, num_round).fit(
#     traindatasetmix[train].astype(np.float64), traindatasetmix.target.astype(bool),
#     sample_weight=traindatasetmix[weights].astype(np.float64))
# proboriginal = original.predict_proba(valdatasetmix[train].astype(np.float64))
# print(proboriginal)
# joblib.dump(original, outputCentral + "_" + Var + '.pkl')
joblib.dump(prob, outputCentral + "_" + Var + '.pkl')
# pickle-based alternatives kept from the original:
# pickle.dump(prob, open(outputCentral + "_" + Var + '.pkl', "wb"))
# pickle.dump(original, open(outputCentral + "_" + Var + '.pkl', "wb"))
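For reference, a minimal sketch of how the dead parameter block above would be re-attached to the wrapper. The parameter names (eta, colsample) are taken verbatim from the snippet and assumed to match REP's keyword arguments; the source does not confirm this.

# hypothetical re-application of the commented-out hyper-parameters;
# eta and colsample spellings are assumed to be valid REP keyword arguments
xgb = XGBoostClassifier(train,
                        n_estimators=200,   # boosting rounds
                        eta=0.1,            # learning rate
                        max_depth=7,        # per-tree depth limit
                        subsample=0.9,      # row subsampling per round
                        colsample=0.6)      # column subsampling per round
xgb.fit(traindatasetmix[train].astype(np.float64),
        traindatasetmix.target.astype(bool),
        sample_weight=traindatasetmix[weights].astype(np.float64))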