# Every event past the signal block gets its label multiplied by -1.
y[signal.shape[0]:] *= -1
# One shared permutation keeps the rows of X and y aligned after shuffling.
permute = RNG.permutation(y.shape[0])
X = X[permute]
y = y[permute]

# Use all dataset for training
# NOTE(review): only X and y are shuffled in this fragment and w_train is never
# passed to fit() below -- confirm the event weights are meant to be unused.
X_train, y_train, w_train = X, y, w

# Declare BDT - we are going to use AdaBoost Decision Tree
# min_samples_leaf is meant as an absolute count (5% of the training events).
# It has to be an int: recent scikit-learn releases interpret a float as a
# fraction of the sample and reject values outside the fractional range.
dt = DecisionTreeClassifier(max_depth=3,
                            min_samples_leaf=max(1, int(round(0.05 * len(X_train)))))
bdt = AdaBoostClassifier(dt,
                         algorithm='SAMME',
                         n_estimators=800,
                         learning_rate=0.5)

# Train BDT
bdt.fit(X_train, y_train)

# Save BDT to pickle file
with open('bdt_sklearn_to_tmva_example.pkl', 'wb') as fid:
    cPickle.dump(bdt, fid)


# Save BDT to TMVA xml file
# Note:
#    - declare input variable names and their type
#    - variable order is important for TMVA
convert_bdt_sklearn_tmva(bdt, [('var1', 'F'), ('var2', 'F')], 'bdt_sklearn_to_tmva_example.xml')

Example #2
0
import cPickle

from skTMVA import convert_bdt_sklearn_tmva

# Pickled scikit-learn BDT that is going to be converted
bdt_path = '/Users/musthero/Documents/Yura/Applications/tmva_local/electrons_v5_VeryTightLH_20per.pkl'

# TMVA xml-file that receives the converted classifier
tmva_outfile_xml = 'SKLearn_BDT_electons.weights.xml'

# Input variables in the order handed to the converter; every one of them is
# declared with the type code 'F'
var_list = list((branch, 'F') for branch in (
    'm_el_pt',
    'm_el_eta',
    'm_el_sigd0PV',
    'm_el_z0SinTheta',
    'm_el_etcone20Dpt',
    'm_el_ptcone20Dpt',
))

# Restore the trained classifier from the pickle file
with open(bdt_path, 'rb') as fid:
    bdt = cPickle.load(fid)

# Write the scikit-learn BDT out as a TMVA xml-file
convert_bdt_sklearn_tmva(bdt, var_list, tmva_outfile_xml)
    def train(self, train_data, classification_variables, variable_dict, sample_name, grid_search):
        """
        Definition:
        -----------
            Training method for sklBDT; it pickles the model into the "classifier" sub-folder
            and, if skTMVA can be imported, also exports it as a TMVA XML weights file

        Args:
        -----
            train_data = dictionary, containing "X", "y", "w" for the training set, where:
                X = ndarray of dim (# training examples, # features)
                y = array of dim (# training examples) with target values
                w = array of dim (# training examples) with event weights
            classification_variables = list of names of variables used for classification
            variable_dict = ordered dict, mapping all the branches from the TTree to their type
            sample_name = string that specifies the file name of the sample being trained on
            grid_search = if truthy, tune n_estimators, max_depth and min_samples_leaf with a
                cross-validated grid search and keep the best refitted model; otherwise train
                a single classifier with fixed hyper-parameters
        """
        logger = logging.getLogger("skl_BDT")

        # -- Train:
        logger.info("Training...")

        if grid_search:
            # Thoughts:
            # -- min_samples_leaf is supposedly faster to train than min_samples_split
            # -- could first tune optimum number of trees
            #    then tune max_depth and min_samples to save on combinatorics
            parameters = {"n_estimators":[100, 150, 200, 250, 300],
                          "max_depth":[2, 4, 6, 8, 10],
                          "min_samples_leaf":[20, 30, 40, 50, 60]}
            fit_params = {"sample_weight":train_data["w"]}
            # Run grid search over provided ranges
            logger.info("Running grid search parameter optimisation...")
            # Use a separate name for the search object so the boolean argument is not shadowed
            searcher = GridSearchCV(
                estimator=GradientBoostingClassifier(
                    learning_rate=0.2, min_samples_leaf=50, max_features="sqrt", subsample=0.8, random_state=10
                ),
                param_grid=parameters, fit_params=fit_params, scoring="roc_auc", n_jobs=-1, iid=False, cv=3, verbose=1
            )
            searcher.fit(train_data["X"], train_data["y"])
            best_params = searcher.best_params_
            # Warn when an optimum sits on the edge of the scanned range: the true optimum may lie outside it
            for param_name, candidates in parameters.items():
                if best_params[param_name] in (candidates[0], candidates[-1]):
                    logger.info("Best value of {} is at limit of considered range!".format(param_name))
            for param_name in parameters:
                logger.info("... {}: {}".format(param_name, best_params[param_name]))
            # Keep the model refitted with the best parameters; without this assignment
            # the dump below fails with a NameError in the grid-search case
            classifier = searcher.best_estimator_

        else:
            classifier = GradientBoostingClassifier(
                n_estimators=300, # was n_estimators=300
                max_depth=10, # was max_depth=15
                min_samples_leaf=40, # was min_samples_split=0.5 * len(train_data["y"])
                verbose=1
                )
            classifier.fit(train_data["X"], train_data["y"], sample_weight=train_data["w"])

        # -- Dump output to pickle
        classifier_directory = os.path.join(self.output_directory, sample_name, self.name, "classifier")
        ensure_directory(classifier_directory)
        joblib.dump(classifier, os.path.join(classifier_directory, "skl_BDT_clf.pkl"), protocol=cPickle.HIGHEST_PROTOCOL)

        # Save BDT to TMVA xml file
        # -- variable order is important for TMVA
        # -- can't yet reproduce scikit-learn output in TMVA(!)
        try:
            from skTMVA import convert_bdt_sklearn_tmva
            logger.info("Exporting output to TMVA XML file")
            variables = [ (v,variable_dict[v]) for v in classification_variables ]
            convert_bdt_sklearn_tmva(
                classifier,
                variables,
                os.path.join(classifier_directory, "skl_BDT_TMVA.weights.xml")
                )
        except ImportError:
            # The TMVA export is optional: the pickled model has already been written
            logger.info("Could not import skTMVA. Skipping export to TMVA output.")
Example #4
0
# Start with every event labelled +1, then multiply the label of every event
# past the signal block by -1.
y = np.ones(X.shape[0])
w = RNG.randint(1, 10, n_events * 2)
y[signal.shape[0]:] *= -1
# One shared permutation keeps the rows of X and y aligned after shuffling.
permute = RNG.permutation(y.shape[0])
X = X[permute]
y = y[permute]

# Use all dataset for training
# NOTE(review): w is not shuffled with X and y, and w_train is never passed to
# fit() below -- confirm the event weights are meant to be unused.
X_train, y_train, w_train = X, y, w

# Declare BDT - we are going to use a Gradient Boosting classifier
# min_samples_leaf is meant as an absolute count (5% of the training events).
# It has to be an int: recent scikit-learn releases interpret a float as a
# fraction of the sample and reject values outside the fractional range.
bdt = GradientBoostingClassifier(n_estimators=200,
                                 learning_rate=0.5,
                                 min_samples_leaf=max(1, int(round(0.05 * len(X_train)))),
                                 max_depth=3,
                                 random_state=0)

# Train BDT
bdt.fit(X_train, y_train)

# Save BDT to pickle file
with open('bdt_sklearn_to_tmva_example.pkl', 'wb') as fid:
    cPickle.dump(bdt, fid)

# Save BDT to TMVA xml file
# Note:
#    - declare input variable names and their type
#    - variable order is important for TMVA
convert_bdt_sklearn_tmva(bdt, [('var1', 'F'), ('var2', 'F')],
                         'bdt_sklearn_to_tmva_example.xml')
         bins=100,
         weights=w_test[(y_test < 0.5)],
         range=[-1, 1],
         histtype='stepfilled',
         label='B (test)',
         color='sandybrown',
         normed=1)

# Label the axes, fix the y-range, add the legend and write the plot to disk
plt.xlabel("predict proba")
plt.ylabel("Arbitrary units")
plt.ylim(0.0, 6.0)
plt.legend(loc='best')
plt.savefig('BDT_score_12042019_endcap.png')

# ---------------------------------------------------------------------------
# Convert the sklearn model to TMVA weights
# ---------------------------------------------------------------------------
from skTMVA import convert_bdt_sklearn_tmva

# Every input variable is declared with the type code 'F'; the names are
# handed to the converter in exactly the order listed here.
convert_bdt_sklearn_tmva(
    model,
    list((feature, 'F') for feature in (
        'SCRawE',
        'r9',
        'sigmaIetaIeta',
        'etaWidth',
        'phiWidth',
        'covIEtaIPhi',
        's4',
        'phoIso03',
        'chgIsoWrtChosenVtx',
        'chgIsoWrtWorstVtx',
        'scEta',
        'rho',
        'esEffSigmaRR',
        'esEnergyOverRawE',
    )),
    'bdt_sklearn_to_tmva_nTree100.xml')
import cPickle

from skTMVA import convert_bdt_sklearn_tmva

# Where the pickled scikit-learn BDT lives and where the TMVA xml-file goes
bdt_path = '/Users/musthero/Documents/Yura/Applications/tmva_local/electrons_v5_VeryTightLH_20per.pkl'
tmva_outfile_xml = 'SKLearn_BDT_electons.weights.xml'

# Read the trained classifier back from the pickle file
with open(bdt_path, 'rb') as fid:
    bdt = cPickle.load(fid)

# Input variable list: each name is paired with the type code 'F', in the
# order in which the names are written below
var_list = list((name, 'F') for name in [
    'm_el_pt', 'm_el_eta', 'm_el_sigd0PV',
    'm_el_z0SinTheta', 'm_el_etcone20Dpt', 'm_el_ptcone20Dpt',
])

# Save the scikit-learn trained BDT classifier to the TMVA xml-file
convert_bdt_sklearn_tmva(bdt, var_list, tmva_outfile_xml)
Example #7
0
# Split without shuffling: 20% of the rows are held out as the test sample,
# which is written to a ROOT file for later evaluation.
data, test = train_test_split(data, shuffle=False, test_size=0.2)
test.to_root('test_sample.root')

#scaler1 = MinMaxScaler().fit(data.drop(['trk_isTrue','trk_mva'],axis=1))
#joblib.dump(scaler1, "scaler.pkl")

# Training features are every column except the target and 'TransMass'
train_x = data.drop(['isSignal', 'TransMass'], axis=1)
#train_x=pd.DataFrame(data=scaler1.transform(train_x),columns=train_x.columns.values)
train_y = data['isSignal']

#clf = RandomForestClassifier(n_estimators=100,verbose=1,n_jobs=-1,class_weight='balanced')
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(verbose=1, n_estimators=500)

clf.fit(train_x, train_y)

# Quick sanity check: predictions for the first five training rows
preds = clf.predict(train_x[:5])

print(preds)

import cPickle
with open('BDT_500.pkl', 'wb') as f:
    cPickle.dump(clf, f)

from skTMVA import convert_bdt_sklearn_tmva
# The TMVA variable list must describe exactly the features the model was
# trained on, in the same order. Take it from the training frame instead of a
# hand-written list: the previous list named 'Transmass', although 'TransMass'
# is dropped from the training features above.
tmva_variables = [(str(column), 'F') for column in train_x.columns]
convert_bdt_sklearn_tmva(clf, tmva_variables, 'BDT_500.xml')
# Names of the classifier input variables; this order is reused below both for
# the TMVA variable list and for registering variables with the reader.
variable_names = [ "abs_eta_j", "abs_eta_jb", "Delta_eta_jb", "idx_by_mH", "idx_by_pT", "idx_by_pT_jb", "m_jb", "pT_j", "pT_jb" ]
# Ordered mapping: variable name -> one-element array with typecode "f",
# initialised to 0. These arrays are handed to the reader's AddVariable below.
variables = OrderedDict( (k,array.array("f",[0])) for k in variable_names )

# Load scikit-learn from pickle
skl_bdt = joblib.load("output/merged_inputs/skl_BDT/classifier/skl_BDT_clf.pkl")

# # Set up ROOT TMVAs
# reader_ROOT = ROOT.TMVA.Reader()
# for variable_name, variable in variables.items():
#     reader_ROOT.AddVariable(variable_name, variable)
# reader_ROOT.BookMVA("scikit-learn", "output/merged_inputs/root_tmva/weights/TMVAClassification_BDT.weights.xml")

# Convert the loaded scikit-learn BDT into a TMVA XML weights file.
from skTMVA import convert_bdt_sklearn_tmva
logging.getLogger("skl_BDT").info("Exporting output to TMVA XML file")
# NOTE(review): variables are declared with type code "D" here, while the
# buffers registered with the reader use typecode "f" -- confirm this is intended.
tree_variables = [ (v,"D") for v in variable_names ]
convert_bdt_sklearn_tmva(skl_bdt, tree_variables, "converted_skl_BDT_TMVA.weights.xml")

# Book the freshly converted weights file in a TMVA reader under the method
# name "converted", registering the variables in the same order as above.
reader_skl = ROOT.TMVA.Reader()
for variable_name, variable in variables.items():
    reader_skl.AddVariable(variable_name, variable)
reader_skl.BookMVA("converted", "converted_skl_BDT_TMVA.weights.xml")

for input_filename in [ glob.glob("inputs/*X275*root")[0] ]:
    logger.info("Now considering {}".format(input_filename))
    input_file = ROOT.TFile.Open(input_filename, "READ")
    event_tree = input_file.Get("events_1tag")
    for idx_evt, event in enumerate(event_tree):
        if idx_evt > 5 : break
        n_pairs = len([ x for x in event.isCorrect ])
        scores = dict( (k,[]) for k in ["ROOT", "scikit-learn", "converted"] )
        for idx_pair in range(n_pairs):