def test_basic_xgboost():
    """Smoke-test XGBoostClassifier on a generated two-class dataset.

    Fits a 10-estimator model, calls ``predict`` and ``predict_proba`` on the
    training data, and checks that the index of the feature-importance result
    equals ``clf.features`` element by element.
    """
    data, labels, _ = generate_classification_data(n_classes=2)
    classifier = XGBoostClassifier(n_estimators=10).fit(data, labels)

    # The return values are not inspected; these calls only have to succeed.
    classifier.predict(data)
    classifier.predict_proba(data)

    # Importances must be reported for the right features, in the same order.
    expected_features = classifier.features
    reported_features = classifier.get_feature_importances().index
    assert numpy.all(expected_features == reported_features)
def test_xgboost_random_states():
    """Check how the ``random_state`` argument affects repeatability.

    For each candidate ``random_state`` two classifiers are created with that
    same value and fitted one after the other on the same data. When the value
    is a ``numpy.random.RandomState`` instance the two prediction matrices are
    expected to differ; for the other values (an int and ``None``) they are
    expected to match.
    """
    X, y, _ = generate_classification_data(n_classes=2, distance=5)
    candidates = [145, None, check_random_state(None), check_random_state(145)]

    for seed in candidates:
        # Build and fit strictly in sequence: the same `seed` object is handed
        # to both models, so the order of the fits is part of the test.
        models = []
        for _ in range(2):
            model = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=seed)
            model.fit(X, y)
            models.append(model)

        first, second = [model.predict_proba(X) for model in models]
        should_match = not isinstance(seed, numpy.random.RandomState)
        assert numpy.allclose(first, second) == should_match, 'seed: {}'.format(seed)
def very_basic_xgboost_test():
    """Run the minimal fit / predict cycle of XGBoostClassifier.

    Trains on generated binary-classification data, exercises both prediction
    methods, and asserts that ``get_feature_importances().index`` matches
    ``clf.features`` position by position.
    """
    features, target, _ = generate_classification_data(n_classes=2)

    model = XGBoostClassifier(n_estimators=10)
    model = model.fit(features, target)

    # Results are discarded on purpose: only checking that prediction runs.
    model.predict(features)
    model.predict_proba(features)

    # Returned importances have to cover the same features in the same order.
    features_match = model.features == model.get_feature_importances().index
    assert numpy.all(features_match)
def test_xgboost_random_states():
    """Verify prediction repeatability for several kinds of ``random_state``.

    Two identically configured classifiers are trained per ``random_state``
    value. The test asserts that their ``predict_proba`` outputs are close
    when the value is an int or ``None``, and NOT close when the value is a
    ``numpy.random.RandomState`` instance shared by both classifiers.
    """
    X, y, _ = generate_classification_data(n_classes=2, distance=5)

    def fitted_classifier(state):
        # Same hyper-parameters every time; only `state` varies between runs.
        estimator = XGBoostClassifier(n_estimators=5, max_depth=1, subsample=0.1, random_state=state)
        estimator.fit(X, y)
        return estimator

    for state in (145, None, check_random_state(None), check_random_state(145)):
        # Both fits happen before any prediction, one after the other.
        clf_a = fitted_classifier(state)
        clf_b = fitted_classifier(state)

        proba_a = clf_a.predict_proba(X)
        proba_b = clf_b.predict_proba(X)
        predictions_agree = numpy.allclose(proba_a, proba_b)

        if isinstance(state, numpy.random.RandomState):
            assert not predictions_agree, 'seed: {}'.format(state)
        else:
            assert predictions_agree, 'seed: {}'.format(state)
def test_xgboost_works_with_different_dtypes():
    """Check that XGBoostClassifier trains and predicts on several dtypes.

    Part one casts features, labels and sample weights to each dtype in turn
    and runs fit + predict_proba. Part two builds a single pandas.DataFrame
    whose columns each have a different dtype and does the same.

    In both parts the predicted probabilities are checked to have one row per
    sample and two columns (the data is generated with ``n_classes=2``), so
    the test no longer passes silently on a malformed prediction.
    """
    import pandas

    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']

    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(X.astype(dtype=dtype), y.astype(dtype=dtype), sample_weight=weights.astype(dtype))
        probabilities = clf.predict_proba(X.astype(dtype))
        assert numpy.shape(probabilities) == (len(X), 2), 'dtype: {}'.format(dtype)

    # testing single pandas.DataFrame with different dtypes
    # Only labels and weights are reused here; the features are rebuilt below.
    _, y, weights = generate_classification_data(n_classes=2, distance=5)
    X = pandas.DataFrame()
    for dtype in dtypes:
        # Column name is the dtype string, one column per dtype.
        X[dtype] = numpy.random.normal(0, 10, size=len(y)).astype(dtype)

    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(X, y, sample_weight=weights)
    probabilities = clf.predict_proba(X)
    assert numpy.shape(probabilities) == (len(X), 2)
def test_xgboost_works_with_different_dtypes():
    """Exercise XGBoostClassifier with inputs of various numeric dtypes.

    First, for every dtype in the list, fresh two-class data is generated and
    features, labels and weights are all cast to that dtype before fitting and
    predicting. Then one pandas.DataFrame mixing all the dtypes (one column
    each, filled with normally distributed values) is used for fit + predict.

    The original test discarded the ``predict_proba`` result; each result is
    now asserted to have shape ``(n_samples, 2)``.
    """
    import pandas

    dtypes = ['float32', 'float64', 'int32', 'int64', 'uint32']
    expected_columns = 2  # data is generated with n_classes=2

    for dtype in dtypes:
        X, y, weights = generate_classification_data(n_classes=2, distance=5)
        typed_X = X.astype(dtype=dtype)
        typed_y = y.astype(dtype=dtype)
        typed_weights = weights.astype(dtype)

        clf = XGBoostClassifier(n_estimators=10)
        clf.fit(typed_X, typed_y, sample_weight=typed_weights)
        probabilities = clf.predict_proba(X.astype(dtype))
        assert numpy.shape(probabilities) == (len(X), expected_columns), 'dtype: {}'.format(dtype)

    # testing single pandas.DataFrame with different dtypes
    # The generated feature matrix is not needed; only y and weights are kept.
    _, y, weights = generate_classification_data(n_classes=2, distance=5)
    n_samples = len(y)
    mixed = pandas.DataFrame()
    for dtype in dtypes:
        mixed[dtype] = numpy.random.normal(0, 10, size=n_samples).astype(dtype)

    clf = XGBoostClassifier(n_estimators=10)
    clf.fit(mixed, y, sample_weight=weights)
    probabilities = clf.predict_proba(mixed)
    assert numpy.shape(probabilities) == (n_samples, expected_columns)
train= trainFeaturesObvious Var='Mass' if ii==2 : train= trainFeaturesHH Var='HH' xgb = XGBoostClassifier(train) #, original = xgboriginal.XGBClassifier(train) """ n_estimators = 200, eta = 0.1, max_depth = 7, subsample = 0.9, colsample = 0.6) """ xgb.fit(traindatasetmix[train].astype(np.float64), traindatasetmix.target.astype(np.bool), sample_weight= (traindatasetmix[weights].astype(np.float64))) prob = xgb.predict_proba(valdatasetmix[train].astype(np.float64) ) if ii==0 : reportAll = xgb.test_on(traindatasetmix[trainFeaturesplot].astype(np.float64), traindatasetmix.target.astype(np.bool)) if ii==1 : reportObvious = xgb.test_on(traindatasetmix[trainFeaturesObvious].astype(np.float64), traindatasetmix.target.astype(np.bool)) if ii==2 : reportHH = xgb.test_on(traindatasetmix[trainFeaturesHH].astype(np.float64), traindatasetmix.target.astype(np.bool)) # compatible with lustr/lxplus #features = ['costhst_DiJets[0]_HH', 'costhst_Jets[0]_DiJets[0]', 'costhst_Jets[2]_DiJets[1]', 'CSV3', 'CSV4', 'Jets[0].eta()', 'Jets[1].eta()', 'Jets[2].eta()', 'Jets[3].eta()', 'HT_other_jets'] #dataout = traindatasetmix.rename(index=str, columns={'HHCost':'costhst_DiJets[0]_HH', 'H1Costbb':'costhst_Jets[0]_DiJets[0]', 'H2Costbb':'costhst_Jets[2]_DiJets[1]', 'CSV3':'CSV3', 'CSV4':'CSV4', 'jeteta1':'Jets[0].eta()', 'jeteta2':'Jets[1].eta()', 'jeteta3':'Jets[2].eta()', 'jeteta4':'Jets[3].eta()', 'jetHTrest':'HT_other_jets'}) #param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } #num_round = 2 #original = xgboriginal.XGBClassifier(param, train, num_round).fit(traindatasetmix[train].astype(np.float64), traindatasetmix.target.astype(np.bool), sample_weight= (traindatasetmix[weights].astype(np.float64))) #proboriginal = original.predict_proba(valdatasetmix[train].astype(np.float64)) #print proboriginal #joblib.dump(original, outputCentral+"_"+Var+'.pkl') joblib.dump(prob, outputCentral+"_"+Var+'.pkl') #pickle.dump(prob, outputCentral+"_"+Var+'.pkl') 
#pickle.dump(original, open(outputCentral+"_"+Var+'.pkl', "wb"))