def _make_df(val, key, branches):
    '''
    Load one input into a dataframe and attach the training target column.

    Args:
    -----
        val: input passed straight to pup.root2panda (path of the root file)
        key: sample key; 'bkg' or a string whose tail encodes an integer
            (e.g. mass-point labels) in regression mode, the class label
            itself in classification mode
        branches: list of branch names to read (the yybb event weight branch
            is always appended)

    Returns:
    --------
        pandas dataframe with an extra 'y' target column

    NOTE(review): reads module-level `tree_name` and `mode` -- assumed to be
    defined at file scope; confirm before reusing this helper elsewhere.
    '''
    df = pup.root2panda(
        val,
        tree_name,
        branches=branches + ['HGamEventInfoAuxDyn.yybb_weight'])
    if mode == 'classification':
        # class label is the key itself
        df['y'] = key
    elif mode == 'regression':
        # background maps to 0; signal keys carry the target in their tail
        df['y'] = 0 if key == 'bkg' else int(key[1:])
    return df
def plotROC(test_ntuple_path):  # , picklename):
    '''
    Definition:
    -----------
        Plot a ROC curve comparison between the old mv2c10 contained in the
        branch and the newly evaluated one, which is loaded in from a pickle
        file. Both the .root file and the pickled mv2 array are assumed to
        be event-flat, not jet-flat.

    Args:
    -----
        test_ntuple_path: string, the path to the root files used for
            evaluation
        picklename: string, the path to the pickle containing the new output
            of your retrained mv2
    '''
    # -- import the root file into a df
    print('Opening files')
    df = pup.root2panda(
        test_ntuple_path,
        'bTag_AntiKt2PV0TrackJets',
        branches=['jet_mv2c10', 'jet_LabDr_HadF'])

    # -- old mv2c10 output, used as the baseline curve
    old_mv2 = pup.flatten(df['jet_mv2c10'])
    # -- truth flavor labels
    truth_flav = pup.flatten(df['jet_LabDr_HadF'])

    # -- retrained outputs produced by evaluate_and_store
    # NOTE(review): pickle file names are hard-coded here even though the
    # commented-out `picklename` argument suggests they were meant to be
    # configurable.
    print('Importing pickle')
    c00 = pup.flatten(cPickle.load(open('val_Alessandro_c00.pkl', 'rb')))
    c07 = pup.flatten(cPickle.load(open('val_Alessandro_c07.pkl', 'rb')))
    c15 = pup.flatten(cPickle.load(open('val_Alessandro_c15.pkl', 'rb')))

    # -- b VS light performance (flavors 0 and 5); change the selection
    # -- if you want to look at a different comparison
    print('Slicing')
    bl_selection = (truth_flav == 0) | (truth_flav == 5)
    print('Plotting')
    plot(bl_selection, 'bl', truth_flav, old_mv2, c00, c07, c15)

    # -- b VS charm performance (flavors 4 and 5)
    print('Slicing')
    bc_selection = (truth_flav == 4) | (truth_flav == 5)
    print('Plotting')
    plot(bc_selection, 'bc', truth_flav, old_mv2, c00, c07, c15)
def plotROC(test_ntuple_path):  # , picklename):
    '''
    Definition:
    -----------
        Compare ROC curves between the mv2c10 values already stored in the
        ntuple branch and the re-evaluated values loaded from pickle files.
        Both the .root file and the pickled mv2 arrays are assumed to be
        event-flat, not jet-flat.

    Args:
    -----
        test_ntuple_path: string, the path to the root files used for
            evaluation
        picklename: string, the path to the pickle containing the new output
            of your retrained mv2
    '''
    print('Opening files')
    frame = pup.root2panda(
        test_ntuple_path,
        'bTag_AntiKt2PV0TrackJets',
        branches=['jet_mv2c10', 'jet_LabDr_HadF'])

    # -- baseline discriminant and truth labels, flattened to jet level
    oldMV2 = pup.flatten(frame['jet_mv2c10'])
    truthflav = pup.flatten(frame['jet_LabDr_HadF'])

    # -- retrained outputs produced by evaluate_and_store (hard-coded paths)
    print('Importing pickle')
    c00 = pup.flatten(cPickle.load(open('val_Alessandro_c00.pkl', 'rb')))
    c07 = pup.flatten(cPickle.load(open('val_Alessandro_c07.pkl', 'rb')))
    c15 = pup.flatten(cPickle.load(open('val_Alessandro_c15.pkl', 'rb')))

    # -- first b VS light (flavors 0, 5), then b VS charm (flavors 4, 5);
    # -- edit the pairs below to study a different performance comparison
    for tag, (flav_a, flav_b) in [('bl', (0, 5)), ('bc', (4, 5))]:
        print('Slicing')
        selection = (truthflav == flav_a) | (truthflav == flav_b)
        print('Plotting')
        plot(selection, tag, truthflav, oldMV2, c00, c07, c15)
# -- remaining CLI options, then load and process the training sample.
# NOTE(review): `parser`, `track_inputs`, `jet_inputs`, `process_data` and
# `io` are defined earlier in the file.
parser.add_argument(
    '--sort_by',
    default='d0z0sig',
    help='str, name of the variable used to order tracks in an event')
parser.add_argument(
    '--ntrk',
    default=30,
    type=int,
    help="Maximum number of tracks per event. \
If the event has fewer tracks, use padding; if is has more, only consider the first ntrk")
args = parser.parse_args()

print('Loading dataframes...')
# -- currently only training and testing on one file each!
trk_train = pup.root2panda(
    os.path.join('data', 'train', args.train_files),
    'bTag_AntiKt4EMTopoJets',
    branches=track_inputs + jet_inputs)
trk_test = pup.root2panda(
    os.path.join('data', 'test', args.test_files),
    'bTag_AntiKt4EMTopoJets',
    branches=track_inputs + jet_inputs)

print('Processing training sample ...')
train_dict = process_data(
    trk_train, jet_inputs, args.ntrk, args.sort_by, args.output,
    savevars=True)
# -- free the raw dataframe before writing the archive
del trk_train
io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'), train_dict)
def main(inputfiles, treename, ftrain, max_n_pairs, exclude_list):
    '''
    Build train/test hdf5 archives for jet-pair selection training.

    Args:
    -----
        inputfiles: list of strings with the paths to root files
        treename: string, name of the TTree that contains the branches
        ftrain: float in range [0, 1], training fraction
        max_n_pairs: int, maximum number of jet pairs to consider per event
        exclude_list: list of column names to drop before building the
            predictor matrix

    Returns:
    --------
        None; writes data/train_dict.hdf5 and data/test_dict.hdf5
    '''
    # -- configure logging
    utils.configure_logging()
    logger = logging.getLogger('main')

    # -- concatenate all files into a pandas df, tagging every event with
    # -- the (extension-stripped) name of the file it came from
    short_filenames = [f.split('/')[-1] for f in inputfiles]
    logger.info('Creating pandas dataframes from: {}'.format(short_filenames))
    frames = []
    for path in inputfiles:
        frame = pup.root2panda(path, treename)
        frame['sample'] = path.split('/')[-1].split('.')[0]
        frames.append(frame)
    df = pd.concat(frames, ignore_index=True)

    # -- remove events with more than one correct jet pair, because that
    # -- shouldn't happen and complicates the task of finding the correct one
    logger.info('Removing events with more than one correct jet pair')
    keep = np.array([sum(yen) for yen in df['isCorrect'].values]) <= 1
    df = df[keep].reset_index(drop=True)

    # -- target: per-event array with one flag per jet pair
    logger.info('Building one-hot target')
    y = df['isCorrect'].values

    # -- array of names of sample of origin
    sample = df['sample'].values

    # -- prepend a "no correct pair" slot: 1 when no pair in the event is
    # -- correct, 0 when one is; each entry now has length (n_jet_pairs + 1)
    y_long = np.array([
        np.insert(yev, 0, 1) if sum(yev) == 0 else np.insert(yev, 0, 0)
        for yev in y
    ])

    # -- event weights
    logger.info('Extracting weights from event_weight')
    w = df['event_weight'].values

    del df['event_weight'], df['isCorrect'], df['sample']
    df = df.drop(exclude_list, axis=1)  # maybe in the future do something
    # better with these variables instead of just removing them

    # -- matrix of predictors plus the original row indices
    X = df.values
    ix = range(X.shape[0])
    varlist = df.columns.values.tolist()

    # -- maximum number of jet pairs to consider in each event, +1 for the
    # -- "no correct pair" slot; can be set to whatever number makes sense
    max_length = max_n_pairs + 1
    logger.info(
        'The max number of jet pairs per event will be {}'.format(max_n_pairs))

    X_train, X_test, y_train, y_test, w_train, w_test,\
        sample_train, sample_test, ix_train, ix_test, scaler_list = \
        shuffle_split_scale_pad(X, y_long, w, sample, ix, ftrain, max_length)

    logger.info('Saving processed data as hdf5 in data/')
    io.save(
        os.path.join('data', 'train_dict.hdf5'),
        {
            'X': X_train,
            'y': y_train,
            'w': w_train,
            'ix': ix_train,
            'vars': varlist,
            'sample': sample_train.tolist(),
            'scalers': scaler_list
        })
    io.save(
        os.path.join('data', 'test_dict.hdf5'),
        {
            'X': X_test,
            'y': y_test,
            'w': w_test,
            'ix': ix_test,
            'vars': varlist,
            'sample': sample_test.tolist(),
            'scalers': scaler_list
        })
'jet_trk_chi2', 'jet_trk_nInnHits', 'jet_trk_nNextToInnHits', 'jet_trk_nBLHits', 'jet_trk_nsharedBLHits', 'jet_trk_nsplitBLHits', 'jet_trk_nPixHits', 'jet_trk_nsharedPixHits', 'jet_trk_nsplitPixHits', 'jet_trk_nSCTHits', 'jet_trk_nsharedSCTHits', 'jet_trk_expectBLayerHit', #'jet_trk_dPhi'] # more to be added in `process_data` 'jet_trk_phi'] # more to be added in `process_data` cut_vars = ['jet_eta', 'jet_pt', 'jet_JVT', 'jet_aliveAfterOR'] # only necessary to remove bad jets # -- load and process training set print 'Loading training dataframe...' trk_train = pup.root2panda( './data/Dan/NOtrkSel/train_NOtrkSel.root', 'bTag_AntiKt4EMTopoJets', branches = track_inputs + cut_vars + ['jet_LabDr_HadF' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_theta'] ) print 'Processing training sample ...' train_dict = process_data(trk_train, cut_vars, savevars=True) del trk_train io.save('./data/train_dict_IPConv_ntuple_MyTrkSel.h5', train_dict) # -- load and process test set print 'Loading test dataframe...' trk_test = pup.root2panda( './data/Dan/NOtrkSel/test/user.dguest.8493098.Akt4EMTo._000013_NOtrkSel.root', 'bTag_AntiKt4EMTopoJets', branches = track_inputs + cut_vars + ['jet_LabDr_HadF' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_theta'] ) print 'Processing test sample...'
def main(inputfiles, treename='bTag_AntiKt2PV0TrackJets'):
    '''
    End-to-end preprocessing for MV2 training on track jets: load ntuples,
    build the MV2 input variables, flatten to jet level, cut, reweight,
    split, and save to hdf5.

    Args:
    -----
        inputfiles: path(s) to the input root file(s), passed to root2panda
        treename: (optional) name of the TTree to consider

    Returns:
    --------
        None; writes train_data.h5 and test_data.h5
    '''
    configure_logging()
    logger = logging.getLogger('ProcessTrackJetData')

    # -- import root files into df
    logger.info('Importing ROOT files into pandas dataframes')
    df = pup.root2panda(
        inputfiles,
        treename,
        branches=[
            'jet_pt', 'jet_eta', 'jet_phi', 'jet_m', 'jet_ip2d_pu',
            'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc',
            'jet_ip3d_pb', 'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z',
            'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t',
            'jet_sv1_sig3d', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx',
            'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_sig3d',
            'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz',
            'jet_aliveAfterOR', 'jet_aliveAfterORmu', 'jet_nConst',
            'jet_LabDr_HadF'
        ])

    # -- Insert default values, calculate MV2 variables from the branches in df
    logger.info('Creating MV2 variables')
    df = transformVars(df)

    # -- Flatten from event-flat to jet-flat
    # -- Before doing so, remove event-level variables such as PVx,y,z
    logger.info('Flattening dataframe')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    # FIX: iterkv() was a long-deprecated alias removed from pandas; use
    # iteritems(), as the other processing functions in this file do.
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})

    # -- apply eta, pt, OR cuts from b-tagging recommendations
    logger.info('Applying cuts')
    df_flat = applycuts(df_flat)

    # -- build X, y, w
    # -- target values
    y = df_flat['jet_LabDr_HadF'].values

    # -- slice df by only keeping the 24 variables for MV2 training
    training_vars = [
        'jet_pt', 'abs(jet_eta)', 'jet_ip2', 'jet_ip2_c', 'jet_ip2_cu',
        'jet_ip3', 'jet_ip3_c', 'jet_ip3_cu', 'jet_sv1_ntrkv', 'jet_sv1_m',
        'jet_sv1_efc', 'jet_sv1_n2t', 'jet_sv1_Lxy', 'jet_sv1_L3d',
        'jet_sv1_sig3d', 'jet_sv1_dR', 'jet_jf_n2t', 'jet_jf_ntrkAtVx',
        'jet_jf_nvtx', 'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc',
        'jet_jf_dR', 'jet_jf_sig3d'
    ]
    # FIX: .as_matrix() is deprecated; .values is the accessor used
    # everywhere else in this file and is behaviorally equivalent here.
    X = df_flat[training_vars].values

    logger.info(
        '2D pT and eta reweighting of charm and light to bottom distribution')
    w = reweight_to_b(X, y)
    X, y, w = remove_tau(X, y, w)

    # -- turn classes 0, 4, 5, 15 to 0, 1, 2, 3
    # le = LabelEncoder()
    # y = le.fit_transform(y)

    # -- randomly shuffle and split into train and test set
    logger.info('Shuffling and splitting')
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        X, y, w, train_size=0.6)

    # -- save out to hdf5
    # FIX: pass paths rather than open('...', 'wb') handles -- the handles
    # were never closed, and every other io.save call site in this file
    # passes a path string.
    logger.info('Saving data to hdf5')
    io.save('train_data.h5', {'X': X_train, 'y': y_train, 'w': w_train})
    io.save('test_data.h5', {'X': X_test, 'y': y_test, 'w': w_test})
def process(i, filepath, yaml_file, model_id):
    '''
    ETL worker: turn one root file into scaled train/test/validate hdf5
    archives for training.

    Args:
    -----
        i: index of this input file, used to name the output archives
        filepath: string, path to the input root file
        yaml_file: yaml file describing branches and training variables,
            consumed by set_features
        model_id: string identifier embedded in the output archive names

    Returns:
    --------
        tuple (n_train, n_test, n_validate) of event counts in each split
    '''
    import pandautils as pup

    # -- load branches from yaml file
    branches, training_vars, ip3d_training_vars, ipmp_training_vars = \
        set_features(yaml_file)

    logger = logging.getLogger("ETL Service")

    # -- load root file to dataframe
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    # -- create MV2 input quantities, set default values
    logger.info('Transforming variables...')
    df = transformVars(df)

    # -- flatten to jet-flat structure; drop event-level PV info first
    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    # -- apply standard cuts on AntiKT4EMTopoJets
    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    # -- create numpy arrays for ML
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values
    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- find weights by reweighting to the light distribution
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    # -- shuffle data, split into train and test
    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test,\
        y_train, y_test,\
        w_train, w_test,\
        ix_train, ix_test,\
        mv2c10_train, mv2c10_test,\
        jet_pt_train, jet_pt_test,\
        ip3d_vars_train, ip3d_vars_test,\
        ipmp_vars_train, ipmp_vars_test = train_test_split(
            X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars,
            train_size=0.6)

    # -- scale inputs to 0 mean, 1 std (fit on train, apply to test;
    # -- the same scaler object is re-fit for each variable group)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    ip3d_vars_train = scaler.fit_transform(ip3d_vars_train)
    ip3d_vars_test = scaler.transform(ip3d_vars_test)
    ipmp_vars_train = scaler.fit_transform(ipmp_vars_train)
    ipmp_vars_test = scaler.transform(ipmp_vars_test)

    # -- split the previously selected training data into train and validate
    X_train, X_validate,\
        y_train, y_validate,\
        w_train, w_validate,\
        ix_train, ix_validate,\
        mv2c10_train, mv2c10_validate,\
        jet_pt_train, jet_pt_validate,\
        ip3d_vars_train, ip3d_vars_validate,\
        ipmp_vars_train, ipmp_vars_validate = train_test_split(
            X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train,
            ip3d_vars_train, ipmp_vars_train,
            train_size=0.7)

    # -- assign train, test, validate data to dictionaries
    train = {
        'X': X_train,
        'ip3d_vars': ip3d_vars_train,
        'ipmp_vars': ipmp_vars_train,
        'y': y_train,
        'w': w_train,
        'ix': ix_train,
        'mv2c10': mv2c10_train,
        'pt': jet_pt_train
    }
    test = {
        'X': X_test,
        'ip3d_vars': ip3d_vars_test,
        'ipmp_vars': ipmp_vars_test,
        'y': y_test,
        'w': w_test,
        'ix': ix_test,
        'mv2c10': mv2c10_test,
        'pt': jet_pt_test
    }
    validate = {
        'X': X_validate,
        'ip3d_vars': ip3d_vars_validate,
        'ipmp_vars': ipmp_vars_validate,
        'y': y_validate,
        'w': w_validate,
        'ix': ix_validate,
        'mv2c10': mv2c10_validate,
        'pt': jet_pt_validate
    }

    # -- save dictionaries to hdf5
    logger.info('Saving dictionaries to hdf5...')
    hdf5_train_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-train-db.h5')
    hdf5_test_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-test-db.h5')
    hdf5_validate_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-validate-db.h5')
    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'.format(
        hdf5_train_path, hdf5_test_path, hdf5_validate_path))

    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
def main(iptagger, root_paths, model_id):
    '''
    Train a small dense network on MV2-style inputs combined with an
    impact-parameter tagger, then evaluate and plot its performance.

    Args:
    -----
        iptagger: string, name of the IP tagger whose variables to include
        root_paths: unused here; input path is currently hard-coded to
            '../data/final_production/*'
        model_id: string identifier used in the weights/output file names

    Returns:
    --------
        None; saves progress weights, yhat-<iptagger>-<model_id>.npy, and
        produces performance plots
    '''
    configure_logging()
    logger = logging.getLogger("Combine_MV2IP")
    logger.info("Running on: {}".format(iptagger))

    branches, training_vars = set_features(iptagger)
    logger.info('Creating dataframe...')
    df = pup.root2panda(
        '../data/final_production/*',
        'bTag_AntiKt4EMTopoJets',
        branches=branches)

    logger.info('Transforming variables...')
    df = transformVars(df, iptagger)

    logger.info('Flattening df...')
    # FIX: iterkv() was a long-deprecated alias removed from pandas;
    # use iteritems() as the other processing functions in this file do.
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    logger.info('Will train on {}'.format(training_vars))
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- reweight to the light distribution
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    #w = reweight_to_b(pteta, y, pt_col=0, eta_col=1)
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test, y_train, y_test, w_train, w_test, \
        ix_train, ix_test, mv2c10_train, mv2c10_test = train_test_split(
            X, y, w, ix, mv2c10, train_size=0.6)

    # -- scale on train statistics only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    le = LabelEncoder()

    # -- small dense classifier over the 4 flavor classes
    net = Sequential()
    net.add(Dense(16, input_shape=(X_train.shape[1], ), activation='relu'))
    net.add(Dropout(0.2))
    net.add(Dense(4, activation='softmax'))
    net.summary()
    net.compile(
        'adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])

    # -- resume from previous progress weights if they exist
    weights_path = iptagger + '-' + model_id + '-progress.h5'
    try:
        logger.info('Trying to load weights from ' + weights_path)
        net.load_weights(weights_path)
        logger.info('Weights found and loaded from ' + weights_path)
    except IOError:
        logger.info('No weights found in ' + weights_path)

    # -- train
    try:
        net.fit(
            X_train, le.fit_transform(y_train),
            verbose=True,
            batch_size=64,
            sample_weight=w_train,
            callbacks=[
                EarlyStopping(
                    verbose=True, patience=100, monitor='val_loss'),
                ModelCheckpoint(
                    weights_path, monitor='val_loss', verbose=True,
                    save_best_only=True)
            ],
            nb_epoch=200,
            validation_split=0.3)
    except KeyboardInterrupt:
        print('\n Stopping early.')

    # -- load in best network
    net.load_weights(weights_path)

    # -- test
    print('Testing...')
    yhat = net.predict(X_test, verbose=True)

    # -- save the predicions to numpy file
    np.save('yhat-{}-{}.npy'.format(iptagger, model_id), yhat)
    test = {'X': X_test, 'y': y_test, 'w': w_test, 'mv2c10': mv2c10_test}

    # -- plot performance
    performance(yhat, test, iptagger)
def main(weights, picklename, filename, treename='bTag_AntiKt2PV0TrackJets'):
    '''
    evaluate the tmva method after transforming input data into right format

    Args:
    -----
        weights: .xml file out of mv2 training containing bdt parameters
        picklename: name of the output pickle to store new mv2 values
        filename: .root file with ntuples used to evaluate the tmva method
        treename: (optional) name of the TTree to consider

    Returns:
    --------
        status

    Raises:
    -------
        nothing yet, but to be improved
    '''
    print('Parsing XML file...')
    # -- Load XML file and extract the ordered list of input variable names
    tree = ET.parse(weights)
    root = tree.getroot()
    var_list = [
        var.attrib['Label']
        for var in root.findall('Variables')[0].findall('Variable')
    ]
    # -- Count the input variables that go into MV2:
    n_vars = len(var_list)

    print('Loading .root file for evaluation...')
    # -- Get ntuples:
    df = pup.root2panda(
        filename,
        treename,
        branches=[
            'jet_pt', 'jet_eta', 'jet_phi', 'jet_m', 'jet_ip2d_pu',
            'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc',
            'jet_ip3d_pb', 'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z',
            'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t',
            'jet_sv1_sig3d', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx',
            'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_sig3d',
            'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz'
        ])

    # -- Insert default values, calculate MV2 variables from the branches in df
    df = transformVars(df)

    # -- Map the short names used in the XML onto our ntuple branch names
    names_mapping = {
        'pt': 'jet_pt',
        'abs(eta)': 'abs(jet_eta)',
        'ip2': 'jet_ip2',
        'ip2_c': 'jet_ip2_c',
        'ip2_cu': 'jet_ip2_cu',
        'ip3': 'jet_ip3',
        'ip3_c': 'jet_ip3_c',
        'ip3_cu': 'jet_ip3_cu',
        'sv1_ntkv': 'jet_sv1_ntrkv',
        'sv1_mass': 'jet_sv1_m',
        'sv1_efrc': 'jet_sv1_efc',
        'sv1_n2t': 'jet_sv1_n2t',
        'sv1_Lxy': 'jet_sv1_Lxy',
        'sv1_L3d': 'jet_sv1_L3d',
        'sv1_sig3': 'jet_sv1_sig3d',
        'sv1_dR': 'jet_sv1_dR',
        'jf_n2tv': 'jet_jf_n2t',
        'jf_ntrkv': 'jet_jf_ntrkAtVx',
        'jf_nvtx': 'jet_jf_nvtx',
        'jf_nvtx1t': 'jet_jf_nvtx1t',
        'jf_mass': 'jet_jf_m',
        'jf_efrc': 'jet_jf_efc',
        'jf_dR': 'jet_jf_dR',
        'jf_sig3': 'jet_jf_sig3d'
    }

    print('Initializing TMVA...')
    # -- TMVA: Initialize reader, add empty variables and weights from training
    reader = TMVA.Reader()
    for var_name in var_list:
        reader.AddVariable(var_name, array('f', [0]))
    reader.BookMVA('BDTG akt2', weights)

    print('Creating feature matrix...')
    # -- Get features for each event (one row per jet) and stack into X_test
    feature_rows = []
    for event in df[[names_mapping[var] for var in var_list]].values:
        feature_rows.extend(
            np.array([normalize_type(jet) for jet in event]).T.tolist())
    X_test = np.array(feature_rows)

    print('Evaluating!')
    # -- TMVA: Evaluate!
    twoclass_output = evaluate_reader(reader, 'BDTG akt2', X_test)

    # -- Reshape the MV2 output into event-jet format
    reorganized = match_shape(twoclass_output, df['jet_pt'])

    import cPickle
    print('Saving new MV2 weights in {}'.format(picklename))
    cPickle.dump(reorganized, open(picklename, 'wb'))

    # -- Write the new branch to the tree (currently de-activated)
    #add_branch(reorganized, filename, treename, 'jet_mv2c20_new')
    print('Done. Success!')
    return 0
def main(iptagger, root_paths, model_id):
    '''
    Combine MV2 inputs with an IP tagger's variables, train a dense network,
    and evaluate/plot its performance.

    Args:
    -----
        iptagger: string, name of the IP tagger whose variables to include
        root_paths: unused here; input path is currently hard-coded to
            '../data/final_production/*'
        model_id: string identifier used in the weights/output file names

    Returns:
    --------
        None; saves progress weights, yhat-<iptagger>-<model_id>.npy, and
        produces performance plots
    '''
    configure_logging()
    logger = logging.getLogger("Combine_MV2IP")
    logger.info("Running on: {}".format(iptagger))
    branches, training_vars = set_features(iptagger)

    logger.info('Creating dataframe...')
    df = pup.root2panda(
        '../data/final_production/*', 'bTag_AntiKt4EMTopoJets',
        branches=branches)
    logger.info('Transforming variables...')
    df = transformVars(df, iptagger)

    logger.info('Flattening df...')
    # FIX: iterkv() was a long-deprecated alias removed from pandas;
    # use iteritems() as the other processing functions in this file do.
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)
    logger.info('Will train on {}'.format(training_vars))

    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    #w = reweight_to_b(pteta, y, pt_col=0, eta_col=1)
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test, y_train, y_test, w_train, w_test, \
        ix_train, ix_test, mv2c10_train, mv2c10_test = train_test_split(
            X, y, w, ix, mv2c10, train_size=0.6)

    # -- scale on train statistics only
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    le = LabelEncoder()

    # -- small dense classifier over the 4 flavor classes
    net = Sequential()
    net.add(Dense(16, input_shape=(X_train.shape[1], ), activation='relu'))
    net.add(Dropout(0.2))
    net.add(Dense(4, activation='softmax'))
    net.summary()
    net.compile(
        'adam', 'sparse_categorical_crossentropy', metrics=['accuracy'])

    # -- resume from previous progress weights if they exist
    weights_path = iptagger + '-' + model_id + '-progress.h5'
    try:
        logger.info('Trying to load weights from ' + weights_path)
        net.load_weights(weights_path)
        logger.info('Weights found and loaded from ' + weights_path)
    except IOError:
        logger.info('No weights found in ' + weights_path)

    # -- train
    try:
        net.fit(
            X_train,
            le.fit_transform(y_train),
            verbose=True,
            batch_size=64,
            sample_weight=w_train,
            callbacks=[
                EarlyStopping(
                    verbose=True, patience=100, monitor='val_loss'),
                ModelCheckpoint(
                    weights_path, monitor='val_loss', verbose=True,
                    save_best_only=True)
            ],
            nb_epoch=200,
            validation_split=0.3)
    except KeyboardInterrupt:
        print('\n Stopping early.')

    # -- load in best network
    net.load_weights(weights_path)

    # -- test
    print('Testing...')
    yhat = net.predict(X_test, verbose=True)

    # -- save the predicions to numpy file
    np.save('yhat-{}-{}.npy'.format(iptagger, model_id), yhat)
    test = {
        'X': X_test,
        'y': y_test,
        'w': w_test,
        'mv2c10': mv2c10_test
    }

    # -- plot performance
    performance(yhat, test, iptagger)
default=30, type=int, help="Maximum number of tracks per event. \ If the event has fewer tracks, use padding; if is has more, only consider the first ntrk") parser.add_argument('--inputs', default='grade', help='one of: hits, grade') args = parser.parse_args() track_inputs, jet_inputs = generate_inputlist(args.inputs) print 'Loading dataframes...' # -- currently only training and testing on one file each! trk_train = pup.root2panda( os.path.join('data', 'train', args.train_files), 'bTag_AntiKt4EMTopoJets', branches = track_inputs + jet_inputs ) trk_test = pup.root2panda( os.path.join('data', 'test', args.test_files), 'bTag_AntiKt4EMTopoJets', branches = track_inputs + jet_inputs ) print 'Processing training sample ...' train_dict = process_data(trk_train, jet_inputs, args.ntrk, args.sort_by, args.output, args.inputs, savevars=True) del trk_train io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'), train_dict) print 'Processing test sample...' test_dict = process_data(trk_test, jet_inputs, args.ntrk, args.sort_by, args.output, args.inputs,) del trk_test
# -- per-track input branches read from the ntuple
track_inputs = [
    'jet_trk_pt', 'jet_trk_d0', 'jet_trk_z0', 'jet_trk_d0sig',
    'jet_trk_z0sig', 'jet_trk_chi2', 'jet_trk_nInnHits',
    'jet_trk_nNextToInnHits', 'jet_trk_nBLHits', 'jet_trk_nsharedBLHits',
    'jet_trk_nsplitBLHits', 'jet_trk_nPixHits', 'jet_trk_nsharedPixHits',
    'jet_trk_nsplitPixHits', 'jet_trk_nSCTHits', 'jet_trk_nsharedSCTHits',
    'jet_trk_expectBLayerHit'
]  # 2 more to be added in `process_data`

# -- jet-level branches needed alongside the track inputs
extra_branches = [
    'jet_truthflav', 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc',
    'jet_phi', 'jet_trk_phi'
]

print('Loading dataframes...')
# -- currently only training and testing on one file each!
trk_train = pup.root2panda(
    './data/train/*410000_00*.root',
    'JetCollection',
    branches=track_inputs + extra_branches)
trk_test = pup.root2panda(
    './data/test/*410000*.root',
    'JetCollection',
    branches=track_inputs + extra_branches)

print('Processing training sample ...')
train_dict = process_data(trk_train, savevars=True)
# -- free the raw dataframe as soon as the processed dict exists
del trk_train
def main(weights, picklename, filename, treename='bTag_AntiKt2PV0TrackJets'):
    '''
    evaluate the tmva method after transforming input data into right format

    Args:
    -----
        weights: .xml file out of mv2 training containing bdt parameters
        picklename: name of the output pickle to store new mv2 values
        filename: .root file with ntuples used to evaluate the tmva method
        treename: (optional) name of the TTree to consider

    Returns:
    --------
        status

    Raises:
    -------
        nothing yet, but to be improved
    '''
    print('Parsing XML file...')
    # -- Load XML file
    tree = ET.parse(weights)
    root = tree.getroot()

    # -- Get list of variable names from XML file
    var_list = [
        v.attrib['Label']
        for v in root.findall('Variables')[0].findall('Variable')
    ]
    # -- Count the input variables that go into MV2:
    n_vars = len(var_list)

    print('Loading .root file for evaluation...')
    # -- Get ntuples:
    df = pup.root2panda(
        filename,
        treename,
        branches=[
            'jet_pt', 'jet_eta', 'jet_phi', 'jet_m', 'jet_ip2d_pu',
            'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc',
            'jet_ip3d_pb', 'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z',
            'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t',
            'jet_sv1_sig3d', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx',
            'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_sig3d',
            'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz'
        ])

    # -- Insert default values, calculate MV2 variables from the branches in df
    df = transformVars(df)

    # -- Map ntuple names to var_list
    names_mapping = {
        'pt': 'jet_pt',
        'abs(eta)': 'abs(jet_eta)',
        'ip2': 'jet_ip2',
        'ip2_c': 'jet_ip2_c',
        'ip2_cu': 'jet_ip2_cu',
        'ip3': 'jet_ip3',
        'ip3_c': 'jet_ip3_c',
        'ip3_cu': 'jet_ip3_cu',
        'sv1_ntkv': 'jet_sv1_ntrkv',
        'sv1_mass': 'jet_sv1_m',
        'sv1_efrc': 'jet_sv1_efc',
        'sv1_n2t': 'jet_sv1_n2t',
        'sv1_Lxy': 'jet_sv1_Lxy',
        'sv1_L3d': 'jet_sv1_L3d',
        'sv1_sig3': 'jet_sv1_sig3d',
        'sv1_dR': 'jet_sv1_dR',
        'jf_n2tv': 'jet_jf_n2t',
        'jf_ntrkv': 'jet_jf_ntrkAtVx',
        'jf_nvtx': 'jet_jf_nvtx',
        'jf_nvtx1t': 'jet_jf_nvtx1t',
        'jf_mass': 'jet_jf_m',
        'jf_efrc': 'jet_jf_efc',
        'jf_dR': 'jet_jf_dR',
        'jf_sig3': 'jet_jf_sig3d'
    }

    print('Initializing TMVA...')
    # -- TMVA: Initialize reader, add empty variables and weights from training
    reader = TMVA.Reader()
    for n in range(n_vars):
        reader.AddVariable(var_list[n], array('f', [0]))
    reader.BookMVA('BDTG akt2', weights)

    print('Creating feature matrix...')
    # -- Get features for each event and store them in X_test
    rows = []
    for event in df[[names_mapping[var] for var in var_list]].values:
        rows.extend(np.array([normalize_type(jet) for jet in event]).T.tolist())
    X_test = np.array(rows)

    print('Evaluating!')
    # -- TMVA: Evaluate!
    twoclass_output = evaluate_reader(reader, 'BDTG akt2', X_test)

    # -- Reshape the MV2 output into event-jet format
    reorganized = match_shape(twoclass_output, df['jet_pt'])

    import cPickle
    print('Saving new MV2 weights in {}'.format(picklename))
    cPickle.dump(reorganized, open(picklename, 'wb'))

    # -- Write the new branch to the tree (currently de-activated)
    #add_branch(reorganized, filename, treename, 'jet_mv2c20_new')
    print('Done. Success!')
    return 0
def process(i, filepath, yaml_file, model_id):
    '''
    Extract, transform and load one root file: build training arrays,
    split into train/test/validate, scale, and persist to hdf5.

    Args:
    -----
        i: index of this input file, used to name the output archives
        filepath: string, path to the input root file
        yaml_file: yaml file describing branches and training variables,
            consumed by set_features
        model_id: string identifier embedded in the output archive names

    Returns:
    --------
        tuple (n_train, n_test, n_validate) of event counts in each split
    '''
    import pandautils as pup

    def _bundle(X_, ip3d_, ipmp_, y_, w_, ix_, mv2_, pt_):
        # package a set of parallel arrays into the archive dictionary schema
        return {
            'X': X_,
            'ip3d_vars': ip3d_,
            'ipmp_vars': ipmp_,
            'y': y_,
            'w': w_,
            'ix': ix_,
            'mv2c10': mv2_,
            'pt': pt_
        }

    # -- load branches from yaml file
    branches, training_vars, ip3d_training_vars, ipmp_training_vars = \
        set_features(yaml_file)

    logger = logging.getLogger("ETL Service")

    # -- load root file to dataframe
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    # -- create MV2 input quantities, set default values
    logger.info('Transforming variables...')
    df = transformVars(df)

    # -- flatten to jet-flat structure; event-level PV columns go first
    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    # -- apply standard cuts on AntiKT4EMTopoJets
    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    # -- create numpy arrays for ML
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values
    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- Find weights by reweighting to the light distribution
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    # -- shuffle data, split into train and test
    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test,\
        y_train, y_test,\
        w_train, w_test,\
        ix_train, ix_test,\
        mv2c10_train, mv2c10_test,\
        jet_pt_train, jet_pt_test,\
        ip3d_vars_train, ip3d_vars_test,\
        ipmp_vars_train, ipmp_vars_test = train_test_split(
            X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars,
            train_size=0.6)

    # -- scale inputs to 0 mean, 1 std; the scaler is re-fit per input group
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    ip3d_vars_train = scaler.fit_transform(ip3d_vars_train)
    ip3d_vars_test = scaler.transform(ip3d_vars_test)
    ipmp_vars_train = scaler.fit_transform(ipmp_vars_train)
    ipmp_vars_test = scaler.transform(ipmp_vars_test)

    # -- split the previously selected training data into train and validate
    X_train, X_validate,\
        y_train, y_validate,\
        w_train, w_validate,\
        ix_train, ix_validate,\
        mv2c10_train, mv2c10_validate,\
        jet_pt_train, jet_pt_validate,\
        ip3d_vars_train, ip3d_vars_validate,\
        ipmp_vars_train, ipmp_vars_validate = train_test_split(
            X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train,
            ip3d_vars_train, ipmp_vars_train,
            train_size=0.7)

    # -- assign train, test, validate data to dictionaries
    train = _bundle(X_train, ip3d_vars_train, ipmp_vars_train, y_train,
                    w_train, ix_train, mv2c10_train, jet_pt_train)
    test = _bundle(X_test, ip3d_vars_test, ipmp_vars_test, y_test,
                   w_test, ix_test, mv2c10_test, jet_pt_test)
    validate = _bundle(X_validate, ip3d_vars_validate, ipmp_vars_validate,
                       y_validate, w_validate, ix_validate, mv2c10_validate,
                       jet_pt_validate)

    # -- save dictionaries to hdf5
    logger.info('Saving dictionaries to hdf5...')
    hdf5_train_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-train-db.h5')
    hdf5_test_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-test-db.h5')
    hdf5_validate_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-validate-db.h5')
    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'.format(
        hdf5_train_path, hdf5_test_path, hdf5_validate_path))

    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
def process(i, filepath, yaml_file):
    '''
    Definition:
    -----------
        ETL one ROOT file for DL1 training: load jets into a dataframe,
        engineer the MV2 input variables, flatten to jet-level, apply the
        calo-jet cuts, compute reweighting weights, split into
        train/test/validate and save each split to an hdf5 archive.

    Args:
    -----
        i: int, index of the input file, used to disambiguate output names
        filepath: string, path to the input .root file
        yaml_file: string, path to the yaml file listing the branches and
                   training variables

    Returns:
    --------
        tuple (n_train, n_test, n_validate) with the number of jets that
        ended up in each split

    Note:
    -----
        Relies on the module-level constant OUTNAME for naming the output
        hdf5 archives.
    '''
    import pandautils as pup

    # -- load branch lists from yaml file
    branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features(
        yaml_file)
    logger = logging.getLogger("ETL Service")

    # -- load root file to dataframe
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    # -- create MV2 input quantities, set default values
    logger.info('Transforming variables...')
    df = transformVars(df)

    # -- flatten to jet-flat structure; PVx/y/z are event-level quantities
    # -- and cannot be flattened per jet, so drop them first
    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    # -- apply standard cuts on AntiKt4EMTopoJets
    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    # -- create numpy arrays for ML
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values    # truth flavour labels
    mv2c10 = df_flat['jet_mv2c10'].values   # reference tagger output, for comparison
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values

    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- find weights by reweighting to the light distribution.
    # -- list.index replaces the previous np.argwhere lookup
    # -- TO DO: pass the pt and eta columns directly, instead of passing their indices
    pt_col = training_vars.index('jet_pt')
    eta_col = training_vars.index('abs(jet_eta)')
    # alternative target distribution: w = reweight_to_b(X, y, pt_col, eta_col)
    w = reweight_to_l(X, y, pt_col, eta_col)
    del df_flat

    # -- shuffle data, split into train and test; keep the original jet
    # -- indices around so events can be traced back after shuffling
    logger.info('Shuffling, splitting, scaling...')
    ix = np.arange(len(y))
    X_train, X_test, y_train, y_test, w_train, w_test, ix_train, ix_test, \
        mv2c10_train, mv2c10_test, jet_pt_train, jet_pt_test, \
        ip3d_vars_train, ip3d_vars_test, ipmp_vars_train, ipmp_vars_test = train_test_split(
            X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars, train_size=0.6)

    # -- scale inputs to 0 mean, 1 std; one scaler per feature set so that
    # -- no scaler is silently refitted with another set's statistics
    X_scaler = StandardScaler()
    X_train = X_scaler.fit_transform(X_train)
    X_test = X_scaler.transform(X_test)
    ip3d_scaler = StandardScaler()
    ip3d_vars_train = ip3d_scaler.fit_transform(ip3d_vars_train)
    ip3d_vars_test = ip3d_scaler.transform(ip3d_vars_test)
    ipmp_scaler = StandardScaler()
    ipmp_vars_train = ipmp_scaler.fit_transform(ipmp_vars_train)
    ipmp_vars_test = ipmp_scaler.transform(ipmp_vars_test)

    # -- split the previously selected training data into train and validate
    X_train, X_validate, y_train, y_validate, w_train, w_validate, ix_train, ix_validate, \
        mv2c10_train, mv2c10_validate, jet_pt_train, jet_pt_validate, \
        ip3d_vars_train, ip3d_vars_validate, ipmp_vars_train, ipmp_vars_validate = train_test_split(
            X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train,
            ip3d_vars_train, ipmp_vars_train, train_size=0.7)

    def _bundle(X_, ip3d_, ipmp_, y_, w_, ix_, mv2_, pt_):
        # pack one split's arrays into the dict layout used by the hdf5 archives
        return {
            'X': X_,
            'ip3d_vars': ip3d_,
            'ipmp_vars': ipmp_,
            'y': y_,
            'w': w_,
            'ix': ix_,
            'mv2c10': mv2_,
            'pt': pt_
        }

    # -- assign train, test, validate data to dictionaries
    train = _bundle(X_train, ip3d_vars_train, ipmp_vars_train,
                    y_train, w_train, ix_train, mv2c10_train, jet_pt_train)
    test = _bundle(X_test, ip3d_vars_test, ipmp_vars_test,
                   y_test, w_test, ix_test, mv2c10_test, jet_pt_test)
    validate = _bundle(X_validate, ip3d_vars_validate, ipmp_vars_validate,
                       y_validate, w_validate, ix_validate, mv2c10_validate,
                       jet_pt_validate)

    def _outpath(split):
        # build '../data/DL1-<OUTNAME><i>-<split>-db.h5'
        return os.path.join(
            '..', 'data', 'DL1-' + OUTNAME + str(i) + '-' + split + '-db.h5')

    # -- save dictionaries to hdf5
    logger.info('Saving dictionaries to hdf5...')
    hdf5_train_path = _outpath('train')
    hdf5_test_path = _outpath('test')
    hdf5_validate_path = _outpath('validate')
    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'.format(
        hdf5_train_path, hdf5_test_path, hdf5_validate_path))
    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
if __name__ == '__main__': track_inputs = ['jet_trk_pt', 'jet_trk_d0', 'jet_trk_z0', 'jet_trk_d0sig', 'jet_trk_z0sig', 'jet_trk_chi2', 'jet_trk_nInnHits', 'jet_trk_nNextToInnHits', 'jet_trk_nBLHits', 'jet_trk_nsharedBLHits', 'jet_trk_nsplitBLHits', 'jet_trk_nPixHits', 'jet_trk_nsharedPixHits', 'jet_trk_nsplitPixHits', 'jet_trk_nSCTHits', 'jet_trk_nsharedSCTHits', 'jet_trk_expectBLayerHit'] # 2 more to be added in `process_data` print 'Loading dataframes...' # -- currently only training and testing on one file each! trk_train = pup.root2panda( './data/train/*410000_00*.root', 'JetCollection', branches = track_inputs + ['jet_truthflav' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi'] ) trk_test = pup.root2panda( './data/test/*410000*.root', 'JetCollection', branches = track_inputs + ['jet_truthflav' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi'] ) print 'Processing training sample ...' train_dict = process_data(trk_train, savevars=True) del trk_train io.save('./data/train_dict_IPConv.h5', train_dict) print 'Processing test sample...'
def main(inputfiles, treename='bTag_AntiKt2PV0TrackJets'):
    '''
    Definition:
    -----------
        Import track-jet ntuples, build the 24 MV2 training variables,
        flatten to jet-level, apply the b-tagging recommendation cuts,
        reweight charm and light to the bottom distribution, remove taus,
        split into train/test and save both sets to hdf5.

    Args:
    -----
        inputfiles: string or list of strings, path(s) to the input .root files
        treename: string, name of the TTree to read
                  (default: 'bTag_AntiKt2PV0TrackJets')
    '''
    configure_logging()
    logger = logging.getLogger('ProcessTrackJetData')

    # -- import root files into df
    logger.info('Importing ROOT files into pandas dataframes')
    df = pup.root2panda(inputfiles, treename, branches=[
        'jet_pt', 'jet_eta', 'jet_phi', 'jet_m',
        'jet_ip2d_pu', 'jet_ip2d_pc', 'jet_ip2d_pb',
        'jet_ip3d_pu', 'jet_ip3d_pc', 'jet_ip3d_pb',
        'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z',
        'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t', 'jet_sv1_sig3d',
        'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx', 'jet_jf_nvtx1t', 'jet_jf_m',
        'jet_jf_efc', 'jet_jf_sig3d', 'jet_jf_deta', 'jet_jf_dphi',
        'PVx', 'PVy', 'PVz',
        'jet_aliveAfterOR', 'jet_aliveAfterORmu', 'jet_nConst', 'jet_LabDr_HadF'])

    # -- insert default values, calculate MV2 variables from the branches in df
    logger.info('Creating MV2 variables')
    df = transformVars(df)

    # -- flatten from event-flat to jet-flat.
    # -- Before doing so, remove event-level variables such as PVx,y,z.
    # -- iteritems() replaces the long-deprecated iterkv()
    logger.info('Flattening dataframe')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})

    # -- apply eta, pt, OR cuts from b-tagging recommendations
    logger.info('Applying cuts')
    df_flat = applycuts(df_flat)

    # -- build X, y, w
    # -- target values
    y = df_flat['jet_LabDr_HadF'].values

    # -- slice df by only keeping the 24 variables for MV2 training
    training_vars = [
        'jet_pt', 'abs(jet_eta)',
        'jet_ip2', 'jet_ip2_c', 'jet_ip2_cu',
        'jet_ip3', 'jet_ip3_c', 'jet_ip3_cu',
        'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t',
        'jet_sv1_Lxy', 'jet_sv1_L3d', 'jet_sv1_sig3d', 'jet_sv1_dR',
        'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx', 'jet_jf_nvtx1t',
        'jet_jf_m', 'jet_jf_efc', 'jet_jf_dR', 'jet_jf_sig3d']
    # -- .values replaces the deprecated as_matrix()
    X = df_flat[training_vars].values

    logger.info('2D pT and eta reweighting of charm and light to bottom distribution')
    w = reweight_to_b(X, y)
    # -- drop tau-labelled jets; remaining classes are 0, 4, 5
    X, y, w = remove_tau(X, y, w)

    # -- randomly shuffle and split into train and test set
    logger.info('Shuffling and splitting')
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        X, y, w, train_size=0.6)

    # -- save out to hdf5; pass the path rather than an open handle so the
    # -- file is managed by io.save itself (the previous open(..., 'wb')
    # -- leaked the handle) and to match the io.save(path, dict) calls
    # -- elsewhere in this file
    logger.info('Saving data to hdf5')
    io.save('train_data.h5', {'X': X_train, 'y': y_train, 'w': w_train})
    io.save('test_data.h5', {'X': X_test, 'y': y_test, 'w': w_test})
'jet_trk_nSCTHits', 'jet_trk_nsharedSCTHits', 'jet_trk_expectBLayerHit', #'jet_trk_dPhi'] # more to be added in `process_data` 'jet_trk_phi' ] # more to be added in `process_data` cut_vars = ['jet_eta', 'jet_pt', 'jet_JVT', 'jet_aliveAfterOR'] # only necessary to remove bad jets # -- load and process training set print 'Loading training dataframe...' trk_train = pup.root2panda( './data/Dan/NOtrkSel/train_NOtrkSel.root', 'bTag_AntiKt4EMTopoJets', branches=track_inputs + cut_vars + [ 'jet_LabDr_HadF', 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_theta' ]) print 'Processing training sample ...' train_dict = process_data(trk_train, cut_vars, savevars=True) del trk_train io.save('./data/train_dict_IPConv_ntuple_MyTrkSel.h5', train_dict) # -- load and process test set print 'Loading test dataframe...' trk_test = pup.root2panda( './data/Dan/NOtrkSel/test/user.dguest.8493098.Akt4EMTo._000013_NOtrkSel.root', 'bTag_AntiKt4EMTopoJets', branches=track_inputs + cut_vars + [ 'jet_LabDr_HadF', 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc',