def _make_df(val, key, branches):
    # -- tree_name and mode are assumed to be module-level globals defined elsewhere
    df = pup.root2panda(val, tree_name, branches=branches + ['HGamEventInfoAuxDyn.yybb_weight'])
    if mode == 'classification':
        df['y'] = key
    elif mode == 'regression':
        if key == 'bkg':
            df['y'] = 0
        else:
            df['y'] = int(key[1:])
    return df
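A minimal usage sketch for `_make_df`, assuming `pandautils` is imported as `pup` and that `tree_name` and `mode` are module-level globals as the body implies; the file paths, tree name, and branch list below are hypothetical:

import pandautils as pup

tree_name = 'CollectionTree'   # assumed global, hypothetical tree name
mode = 'regression'            # assumed global: 'classification' or 'regression'

# signal keys look like 'm<mass>' so that int(key[1:]) extracts the mass target;
# the special key 'bkg' maps to y = 0
df_sig = _make_df('data/X300.root', 'm300', ['HGamEventInfoAuxDyn.m_yy'])
df_bkg = _make_df('data/bkg.root', 'bkg', ['HGamEventInfoAuxDyn.m_yy'])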
Example #3
def plotROC(test_ntuple_path):  #, picklename):
    '''
    Definition:
    -----------
        Plot a ROC curve comparison between the old mv2c10 contained in the branch
        and the newly evaluated one, which is loaded in from a pickle file.
        Both the .root file and the pickled mv2 array are assumed to be event-flat,
        not jet-flat.

    Args:
    -----
        test_ntuple_path: string, the path to the root files used for evaluation
        picklename: string, the path to the pickle containing the new output of your
            retrained mv2 (currently commented out: the pickle filenames are
            hardcoded in the body below)
    '''

    # -- import the root file into a df
    print 'Opening files'
    df = pup.root2panda(test_ntuple_path,
                        'bTag_AntiKt2PV0TrackJets',
                        branches=['jet_mv2c10', 'jet_LabDr_HadF'])
    # -- extract the old mv2c10 branch for comparison
    oldMV2 = pup.flatten(df['jet_mv2c10'])
    # -- extract the truth labels
    truthflav = pup.flatten(df['jet_LabDr_HadF'])

    # -- open the pickle produced by evaluate_and_store
    print 'Importing pickle'
    c00 = pup.flatten(cPickle.load(open('val_Alessandro_c00.pkl', 'rb')))
    c07 = pup.flatten(cPickle.load(open('val_Alessandro_c07.pkl', 'rb')))
    c15 = pup.flatten(cPickle.load(open('val_Alessandro_c15.pkl', 'rb')))

    # -- this allows you to check performance on b VS light
    # -- change it, if you want to look at a different performance
    print 'Slicing'
    bl_selection = (truthflav == 0) | (truthflav == 5)
    print 'Plotting'
    plot(bl_selection, 'bl', truthflav, oldMV2, c00, c07, c15)

    print 'Slicing'
    bc_selection = (truthflav == 4) | (truthflav == 5)
    print 'Plotting'
    plot(bc_selection, 'bc', truthflav, oldMV2, c00, c07, c15)
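The same two-line pattern extends to other flavour pairs inside plotROC; a hedged sketch for b versus tau, assuming the usual b-tagging truth codes (0 = light, 4 = charm, 5 = bottom, 15 = tau, consistent with the class list mentioned elsewhere in this file):

    print 'Slicing'
    bt_selection = (truthflav == 15) | (truthflav == 5)
    print 'Plotting'
    plot(bt_selection, 'bt', truthflav, oldMV2, c00, c07, c15)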
    parser.add_argument(
        '--sort_by',
        default='d0z0sig',
        help='str, name of the variable used to order tracks in an event')
    parser.add_argument('--ntrk',
                        default=30,
                        type=int,
                        help="Maximum number of tracks per event. \
        If the event has fewer tracks, use padding; if is has more, only consider the first ntrk"
                        )
    args = parser.parse_args()

    print 'Loading dataframes...'
    # -- currently only training and testing on one file each!
    trk_train = pup.root2panda(os.path.join('data', 'train', args.train_files),
                               'bTag_AntiKt4EMTopoJets',
                               branches=track_inputs + jet_inputs)
    trk_test = pup.root2panda(os.path.join('data', 'test', args.test_files),
                              'bTag_AntiKt4EMTopoJets',
                              branches=track_inputs + jet_inputs)
    print 'Processing training sample ...'
    train_dict = process_data(trk_train,
                              jet_inputs,
                              args.ntrk,
                              args.sort_by,
                              args.output,
                              savevars=True)
    del trk_train
    io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'),
            train_dict)
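For reference, a hypothetical invocation of the script fragment above (the script name and the file/output flags are assumptions inferred from the args.* usage; only --sort_by and --ntrk appear in the visible parser code):

#   python train.py --sort_by d0z0sig --ntrk 30 --train_files train.root --test_files test.root --output v1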
def main(inputfiles, treename, ftrain, max_n_pairs, exclude_list):
    '''
    Args:
    -----
        inputfiles: list of strings with the paths to root files
        treename: string, name of the TTree that contains the branches
        ftrain: float in range [0, 1], training fraction
        max_n_pairs: int, maximum number of jet pairs to consider per event
        exclude_list: list of strings, names of columns to drop from the
            predictor matrix (they are removed with df.drop below)
    Returns:
    --------
        Nothing; writes train_dict.hdf5 and test_dict.hdf5 into data/.
    '''
    # -- configure logging
    utils.configure_logging()
    logger = logging.getLogger('main')

    # -- concatenate all files into a pandas df
    short_filenames = [f.split('/')[-1] for f in inputfiles]
    logger.info('Creating pandas dataframes from: {}'.format(short_filenames))
    #df = pd.concat([pup.root2panda(f, treename) for f in inputfiles], ignore_index=True)
    df_list = []
    for f in inputfiles:
        df_temp = pup.root2panda(f, treename)
        df_temp['sample'] = f.split('/')[-1].split('.')[0]
        df_list.append(df_temp)
    df = pd.concat(df_list, ignore_index=True)

    # -- remove events with more than one correct jet pair
    # -- because that shouldn't happen and complicates the task
    # -- of finding the correct jet pair
    logger.info('Removing events with more than one correct jet pair')
    keep = np.array([sum(yen) for yen in df['isCorrect'].values]) <= 1
    df = df[keep].reset_index(drop=True)

    # -- target
    logger.info('Building one-hot target')
    y = df['isCorrect'].values

    # -- extract array of names of sample of origin
    sample = df['sample'].values

    # -- prepend 1 to all entries in y where there is no correct jet pair,
    # -- 0 if there exists a correct jet pair already
    # -- each entry in y will now have length (n_jet_pairs + 1)
    y_long = np.array([
        np.insert(yev, 0, 1) if sum(yev) == 0 else np.insert(yev, 0, 0)
        for yev in y
    ])
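    # -- e.g. [0, 1, 0] (a correct pair exists) becomes [0, 0, 1, 0],
    # -- while [0, 0, 0] (no correct pair) becomes [1, 0, 0, 0]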

    # -- weights
    logger.info('Extracting weights from event_weight')
    w = df['event_weight'].values
    del df['event_weight'], df['isCorrect'], df['sample']
    df = df.drop(exclude_list, axis=1)  # maybe in the future do something
    # better with these variables instead of just removing them

    # -- matrix of predictors
    X = df.values
    ix = range(X.shape[0])
    varlist = df.columns.values.tolist()

    # -- maximum number of jet pairs to consider in each event
    # -- can be set to whatever number makes sense
    #max_length = max([len(b) for b in df['Delta_eta_jb']]) + 1
    max_length = max_n_pairs + 1
    logger.info(
        'The max number of jet pairs per event will be {}'.format(max_n_pairs))

    X_train, X_test, y_train, y_test, w_train, w_test,\
    sample_train, sample_test, ix_train, ix_test, scaler_list = shuffle_split_scale_pad(
        X, y_long, w, sample, ix, ftrain, max_length
    )

    logger.info('Saving processed data as hdf5 in data/')
    io.save(
        os.path.join('data', 'train_dict.hdf5'), {
            'X': X_train,
            'y': y_train,
            'w': w_train,
            'ix': ix_train,
            'vars': varlist,
            'sample': sample_train.tolist(),
            'scalers': scaler_list
        })

    io.save(
        os.path.join('data', 'test_dict.hdf5'), {
            'X': X_test,
            'y': y_test,
            'w': w_test,
            'ix': ix_test,
            'vars': varlist,
            'sample': sample_test.tolist(),
            'scalers': scaler_list
        })
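A hedged invocation sketch for this `main`; every argument value below is hypothetical:

import glob

main(
    inputfiles=glob.glob('data/*.root'),
    treename='nominal',     # hypothetical TTree name
    ftrain=0.7,
    max_n_pairs=7,
    exclude_list=[]         # columns to drop, if any
)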
Example #7
def main(inputfiles, treename='bTag_AntiKt2PV0TrackJets'):

    configure_logging()
    logger = logging.getLogger('ProcessTrackJetData')

    # -- import root files into df
    logger.info('Importing ROOT files into pandas dataframes')
    df = pup.root2panda(
        inputfiles,
        treename,
        branches=[
            'jet_pt', 'jet_eta', 'jet_phi', 'jet_m', 'jet_ip2d_pu',
            'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc',
            'jet_ip3d_pb', 'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z',
            'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t',
            'jet_sv1_sig3d', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx',
            'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_sig3d',
            'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz',
            'jet_aliveAfterOR', 'jet_aliveAfterORmu', 'jet_nConst',
            'jet_LabDr_HadF'
        ])

    # -- Insert default values, calculate MV2 variables from the branches in df
    logger.info('Creating MV2 variables')
    df = transformVars(df)

    # -- Flatten from event-flat to jet-flat
    # -- Before doing so, remove event-level variables such as PVx,y,z
    logger.info('Flattening dataframe')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})

    # -- apply eta, pt, OR cuts from b-tagging recommendations
    logger.info('Applying cuts')
    df_flat = applycuts(df_flat)

    # -- build X, y, w
    # -- target values
    y = df_flat['jet_LabDr_HadF'].values

    # -- slice df by only keeping the 24 variables for MV2 training
    training_vars = [
        'jet_pt', 'abs(jet_eta)', 'jet_ip2', 'jet_ip2_c', 'jet_ip2_cu',
        'jet_ip3', 'jet_ip3_c', 'jet_ip3_cu', 'jet_sv1_ntrkv', 'jet_sv1_m',
        'jet_sv1_efc', 'jet_sv1_n2t', 'jet_sv1_Lxy', 'jet_sv1_L3d',
        'jet_sv1_sig3d', 'jet_sv1_dR', 'jet_jf_n2t', 'jet_jf_ntrkAtVx',
        'jet_jf_nvtx', 'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_dR',
        'jet_jf_sig3d'
    ]
    X = df_flat[training_vars].values
    logger.info(
        '2D pT and eta reweighting of charm and light to bottom distribution')
    w = reweight_to_b(X, y)

    X, y, w = remove_tau(X, y, w)

    # -- turn classes 0, 4, 5, 15 to 0, 1, 2, 3
    # le = LabelEncoder()
    # y = le.fit_transform(y)

    # -- randomly shuffle and split into train and test set
    logger.info('Shuffling and splitting')
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        X, y, w, train_size=0.6)

    # -- save out to hdf5
    logger.info('Saving data to hdf5')
    io.save(open('train_data.h5', 'wb'), {
        'X': X_train,
        'y': y_train,
        'w': w_train
    })
    io.save(open('test_data.h5', 'wb'), {
        'X': X_test,
        'y': y_test,
        'w': w_test
    })
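`remove_tau` is not defined in this snippet; judging from the commented class list (0, 4, 5, 15), it presumably drops tau-labelled jets before training. A minimal sketch of that assumed behaviour:

def remove_tau(X, y, w):
    # assumed behaviour: keep only light (0), charm (4) and bottom (5) jets,
    # dropping taus (15); X, y, w are numpy arrays as built above
    keep = (y != 15)
    return X[keep], y[keep], w[keep]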
Example #9
def main(iptagger, root_paths, model_id):
    configure_logging()
    logger = logging.getLogger("Combine_MV2IP")
    logger.info("Running on: {}".format(iptagger))

    branches, training_vars = set_features(iptagger)
    logger.info('Creating dataframe...')
    df = pup.root2panda('../data/final_production/*',
                        'bTag_AntiKt4EMTopoJets',
                        branches=branches)

    logger.info('Transforming variables...')
    df = transformVars(df, iptagger)

    logger.info('Flattening df...')
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    logger.info('Will train on {}'.format(training_vars))
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    #w = reweight_to_b(pteta, y, pt_col=0, eta_col=1)
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test, y_train, y_test, w_train, w_test, \
    ix_train, ix_test, mv2c10_train, mv2c10_test = train_test_split(
        X, y, w, ix, mv2c10, train_size=0.6
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    le = LabelEncoder()
    net = Sequential()
    net.add(Dense(16, input_shape=(X_train.shape[1], ), activation='relu'))
    net.add(Dropout(0.2))
    net.add(Dense(4, activation='softmax'))
    net.summary()
    net.compile('adam',
                'sparse_categorical_crossentropy',
                metrics=['accuracy'])

    weights_path = iptagger + '-' + model_id + '-progress.h5'
    try:
        logger.info('Trying to load weights from ' + weights_path)
        net.load_weights(weights_path)
        logger.info('Weights found and loaded from ' + weights_path)
    except IOError:
        logger.info('No weights found in ' + weights_path)

    # -- train
    try:
        net.fit(X_train,
                le.fit_transform(y_train),
                verbose=True,
                batch_size=64,
                sample_weight=w_train,
                callbacks=[
                    EarlyStopping(verbose=True,
                                  patience=100,
                                  monitor='val_loss'),
                    ModelCheckpoint(weights_path,
                                    monitor='val_loss',
                                    verbose=True,
                                    save_best_only=True)
                ],
                nb_epoch=200,
                validation_split=0.3)
    except KeyboardInterrupt:
        print '\n Stopping early.'

    # -- load in best network
    net.load_weights(weights_path)

    # -- test
    print 'Testing...'
    yhat = net.predict(X_test, verbose=True)

    # -- save the predictions to numpy file
    np.save('yhat-{}-{}.npy'.format(iptagger, model_id), yhat)
    test = {'X': X_test, 'y': y_test, 'w': w_test, 'mv2c10': mv2c10_test}
    # -- plot performance
    performance(yhat, test, iptagger)
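A hedged post-processing sketch, not taken from the source: collapsing the 4-class softmax output `yhat` into a single DL1-style b-tagging discriminant. It assumes the LabelEncoder maps the truth labels (0, 4, 5, 15) to columns (light, charm, bottom, tau) in that order; the charm fraction is an arbitrary choice:

import numpy as np

fc = 0.07  # assumed charm fraction in the background mixture
pu, pc, pb = yhat[:, 0], yhat[:, 1], yhat[:, 2]
# log-likelihood-ratio discriminant; the epsilon avoids division by zero
disc = np.log(pb / (fc * pc + (1.0 - fc) * pu + 1e-12))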
def main(weights, picklename, filename, treename='bTag_AntiKt2PV0TrackJets'):
    '''
    Evaluate the TMVA method after transforming the input data into the right format.
    Args:
    -----
        weights:    .xml file out of mv2 training containing bdt parameters
        picklename: name of the output pickle to store new mv2 values
        filename:   .root file with ntuples used to evaluate the tmva method
        treename:   (optional) name of the TTree to consider 
    Returns:
    --------
        status
    Raises:
    -------
        nothing yet, but to be improved
    '''
    print 'Parsing XML file...'
    # -- Load XML file
    tree = ET.parse(weights)
    root = tree.getroot()

    # -- Get list of variable names from XML file
    var_list = [
        var.attrib['Label']
        for var in root.findall('Variables')[0].findall('Variable')
    ]

    # -- Count the input variables that go into MV2:
    n_vars = len(var_list)

    print 'Loading .root file for evaluation...'
    # -- Get ntuples:
    df = pup.root2panda(
        filename,
        treename,
        branches=[
            'jet_pt', 'jet_eta', 'jet_phi', 'jet_m', 'jet_ip2d_pu',
            'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc',
            'jet_ip3d_pb', 'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z',
            'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t',
            'jet_sv1_sig3d', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx',
            'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_sig3d',
            'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz'
        ])

    # -- Insert default values, calculate MV2 variables from the branches in df
    df = transformVars(df)

    # -- Map ntuple names to var_list
    names_mapping = {
        'pt': 'jet_pt',
        'abs(eta)': 'abs(jet_eta)',
        'ip2': 'jet_ip2',
        'ip2_c': 'jet_ip2_c',
        'ip2_cu': 'jet_ip2_cu',
        'ip3': 'jet_ip3',
        'ip3_c': 'jet_ip3_c',
        'ip3_cu': 'jet_ip3_cu',
        'sv1_ntkv': 'jet_sv1_ntrkv',
        'sv1_mass': 'jet_sv1_m',
        'sv1_efrc': 'jet_sv1_efc',
        'sv1_n2t': 'jet_sv1_n2t',
        'sv1_Lxy': 'jet_sv1_Lxy',
        'sv1_L3d': 'jet_sv1_L3d',
        'sv1_sig3': 'jet_sv1_sig3d',
        'sv1_dR': 'jet_sv1_dR',
        'jf_n2tv': 'jet_jf_n2t',
        'jf_ntrkv': 'jet_jf_ntrkAtVx',
        'jf_nvtx': 'jet_jf_nvtx',
        'jf_nvtx1t': 'jet_jf_nvtx1t',
        'jf_mass': 'jet_jf_m',
        'jf_efrc': 'jet_jf_efc',
        'jf_dR': 'jet_jf_dR',
        'jf_sig3': 'jet_jf_sig3d'
    }

    print 'Initializing TMVA...'
    # -- TMVA: Initialize reader, add empty variables and weights from training
    reader = TMVA.Reader()
    for n in range(n_vars):
        reader.AddVariable(var_list[n], array('f', [0]))
    reader.BookMVA('BDTG akt2', weights)

    print 'Creating feature matrix...'
    # -- Get features for each event and store them in X_test
    X_buf = []
    for event in df[[names_mapping[var] for var in var_list]].values:
        X_buf.extend(
            np.array([normalize_type(jet) for jet in event]).T.tolist())
    X_test = np.array(X_buf)

    print 'Evaluating!'
    # -- TMVA: Evaluate!
    twoclass_output = evaluate_reader(reader, 'BDTG akt2', X_test)

    # -- Reshape the MV2 output into event-jet format
    reorganized = match_shape(twoclass_output, df['jet_pt'])

    import cPickle
    print 'Saving new MV2 weights in {}'.format(picklename)
    cPickle.dump(reorganized, open(picklename, 'wb'))

    # -- Write the new branch to the tree (currently de-activated)
    #add_branch(reorganized, filename, treename, 'jet_mv2c20_new')

    print 'Done. Success!'
    return 0
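A hedged call sketch for this `main`; the weight-file, pickle, and ntuple names are hypothetical:

status = main(
    weights='weights/TMVAClassification_BDTG.weights.xml',  # hypothetical path
    picklename='new_mv2.pkl',
    filename='data/eval_ntuple.root'
)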
Example #12
        default=30, type=int,
        help="Maximum number of tracks per event. \
        If the event has fewer tracks, use padding; if it has more, only consider the first ntrk")
    parser.add_argument('--inputs', 
        default='grade', 
        help='one of: hits, grade')
    args = parser.parse_args()

    track_inputs, jet_inputs = generate_inputlist(args.inputs)


    print 'Loading dataframes...'
    # -- currently only training and testing on one file each!
    trk_train = pup.root2panda(
        os.path.join('data', 'train', args.train_files), 
        'bTag_AntiKt4EMTopoJets', 
        branches = track_inputs + jet_inputs
    )
    trk_test  = pup.root2panda(
        os.path.join('data', 'test', args.test_files), 
        'bTag_AntiKt4EMTopoJets', 
        branches = track_inputs + jet_inputs
    )
    print 'Processing training sample ...'
    train_dict = process_data(trk_train, jet_inputs, args.ntrk, args.sort_by, args.output, args.inputs, savevars=True)
    del trk_train
    io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'), train_dict)

    print 'Processing test sample...'
    test_dict = process_data(trk_test, jet_inputs, args.ntrk, args.sort_by, args.output, args.inputs,)
    del trk_test
Example #15
def process(i, filepath, yaml_file, model_id):
    '''
    ETL for a single input file: load the branches listed in the yaml file,
    build the MV2 input variables, flatten to jet-flat structure, apply cuts,
    split into train/test/validate, scale, and save dictionaries to hdf5.
    Returns a tuple with the number of jets in each split.
    '''
    import pandautils as pup

    # -- load branches from yaml file
    branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features(
        yaml_file)
    logger = logging.getLogger("ETL Service")

    # -- load root file to dataframe
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    # -- create MV2 input quantities, set default values
    logger.info('Transforming variables...')
    df = transformVars(df)

    # -- flatten to jet-flat structure
    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    # -- apply standard cuts on AntiKt4EMTopoJets
    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    # -- create numpy arrays for ML
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values

    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- Find weights by reweighting to the light distribution
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    # -- shuffle data, split into train and test
    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test,\
    y_train, y_test,\
    w_train, w_test,\
    ix_train, ix_test, \
    mv2c10_train, mv2c10_test,\
    jet_pt_train, jet_pt_test,\
    ip3d_vars_train, ip3d_vars_test,\
    ipmp_vars_train, ipmp_vars_test = train_test_split(
        X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars, train_size=0.6
    )

    # -- scale inputs to 0 mean, 1 std (one scaler per feature set, fit on train only)
    scaler_X = StandardScaler()
    X_train = scaler_X.fit_transform(X_train)
    X_test = scaler_X.transform(X_test)
    scaler_ip3d = StandardScaler()
    ip3d_vars_train = scaler_ip3d.fit_transform(ip3d_vars_train)
    ip3d_vars_test = scaler_ip3d.transform(ip3d_vars_test)
    scaler_ipmp = StandardScaler()
    ipmp_vars_train = scaler_ipmp.fit_transform(ipmp_vars_train)
    ipmp_vars_test = scaler_ipmp.transform(ipmp_vars_test)

    # -- split the previously selected training data into train and validate
    X_train, X_validate,\
    y_train, y_validate,\
    w_train, w_validate,\
    ix_train, ix_validate,\
    mv2c10_train, mv2c10_validate,\
    jet_pt_train, jet_pt_validate,\
    ip3d_vars_train, ip3d_vars_validate,\
    ipmp_vars_train, ipmp_vars_validate = train_test_split(
        X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train, ip3d_vars_train, ipmp_vars_train, train_size=0.7
    )

    # -- assign train, test, validate data to dictionaries
    train = {
        'X': X_train,
        'ip3d_vars': ip3d_vars_train,
        'ipmp_vars': ipmp_vars_train,
        'y': y_train,
        'w': w_train,
        'ix': ix_train,
        'mv2c10': mv2c10_train,
        'pt': jet_pt_train
    }

    test = {
        'X': X_test,
        'ip3d_vars': ip3d_vars_test,
        'ipmp_vars': ipmp_vars_test,
        'y': y_test,
        'w': w_test,
        'ix': ix_test,
        'mv2c10': mv2c10_test,
        'pt': jet_pt_test
    }

    validate = {
        'X': X_validate,
        'ip3d_vars': ip3d_vars_validate,
        'ipmp_vars': ipmp_vars_validate,
        'y': y_validate,
        'w': w_validate,
        'ix': ix_validate,
        'mv2c10': mv2c10_validate,
        'pt': jet_pt_validate
    }

    # -- save dictionaries to hdf5
    logger.info('Saving dictionaries to hdf5...')
    hdf5_train_path = os.path.join('..', 'data',
                                   'DL1-' + model_id + str(i) + '-train-db.h5')
    hdf5_test_path = os.path.join('..', 'data',
                                  'DL1-' + model_id + str(i) + '-test-db.h5')
    hdf5_validate_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-validate-db.h5')

    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'.format(
        hdf5_train_path, hdf5_test_path, hdf5_validate_path))

    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
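A hedged driver sketch for `process`: loop over a list of input files and aggregate the returned split sizes; the file paths, yaml name, and model id are hypothetical:

files = ['../data/ntuple_000001.root', '../data/ntuple_000002.root']  # hypothetical
counts = [process(i, f, 'variables.yaml', 'v0') for i, f in enumerate(files)]
n_train, n_test, n_validate = [sum(c) for c in zip(*counts)]
print 'Total jets -- train: {}, test: {}, validate: {}'.format(n_train, n_test, n_validate)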
def process(i, filepath, yaml_file):
    '''
    Same ETL flow as `process` above, but the reweighting uses pt and |eta|
    column indices looked up from `training_vars`, and the output file names
    use the global OUTNAME instead of a model_id argument.
    '''
    import pandautils as pup

    branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features(
        yaml_file)
    logger = logging.getLogger("ETL Service")
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    logger.info('Transforming variables...')
    df = transformVars(df)

    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values
    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- Find weights by reweighting to the light distribution
    # -- TO DO: pass the pt and eta columns directly, instead of passing their indices
    pt_col = np.argwhere(np.array(training_vars) == 'jet_pt')[0][0]
    eta_col = np.argwhere(np.array(training_vars) == 'abs(jet_eta)')[0][0]
    #w = reweight_to_b(X, y, pt_col, eta_col)
    w = reweight_to_l(X, y, pt_col, eta_col)
    del df_flat

    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test, y_train, y_test, w_train, w_test, ix_train, ix_test, \
    mv2c10_train, mv2c10_test, jet_pt_train, jet_pt_test, \
    ip3d_vars_train, ip3d_vars_test, ipmp_vars_train, ipmp_vars_test = train_test_split(
        X,
        y,
        w,
        ix,
        mv2c10,
        jet_pt,
        ip3d_vars,
        ipmp_vars,
        train_size=0.6)

    # -- scale inputs to 0 mean, 1 std (one scaler per feature set, fit on train only)
    scaler_X = StandardScaler()
    X_train = scaler_X.fit_transform(X_train)
    X_test = scaler_X.transform(X_test)
    scaler_ip3d = StandardScaler()
    ip3d_vars_train = scaler_ip3d.fit_transform(ip3d_vars_train)
    ip3d_vars_test = scaler_ip3d.transform(ip3d_vars_test)
    scaler_ipmp = StandardScaler()
    ipmp_vars_train = scaler_ipmp.fit_transform(ipmp_vars_train)
    ipmp_vars_test = scaler_ipmp.transform(ipmp_vars_test)

    X_train, X_validate, y_train, y_validate, w_train, w_validate, ix_train, ix_validate, \
    mv2c10_train, mv2c10_validate, jet_pt_train, jet_pt_validate, \
    ip3d_vars_train, ip3d_vars_validate, ipmp_vars_train, ipmp_vars_validate = train_test_split(
        X_train,
        y_train,
        w_train,
        ix_train,
        mv2c10_train,
        jet_pt_train,
        ip3d_vars_train,
        ipmp_vars_train,
        train_size=0.7)

    train = {
        'X': X_train,
        'ip3d_vars': ip3d_vars_train,
        'ipmp_vars': ipmp_vars_train,
        'y': y_train,
        'w': w_train,
        'ix': ix_train,
        'mv2c10': mv2c10_train,
        'pt': jet_pt_train
    }

    test = {
        'X': X_test,
        'ip3d_vars': ip3d_vars_test,
        'ipmp_vars': ipmp_vars_test,
        'y': y_test,
        'w': w_test,
        'ix': ix_test,
        'mv2c10': mv2c10_test,
        'pt': jet_pt_test
    }

    validate = {
        'X': X_validate,
        'ip3d_vars': ip3d_vars_validate,
        'ipmp_vars': ipmp_vars_validate,
        'y': y_validate,
        'w': w_validate,
        'ix': ix_validate,
        'mv2c10': mv2c10_validate,
        'pt': jet_pt_validate
    }

    logger.info('Saving dictionaries to hdf5...')
    # -- OUTNAME is assumed to be a module-level constant defined elsewhere
    hdf5_train_path = os.path.join('..', 'data',
                                   'DL1-' + OUTNAME + str(i) + '-train-db.h5')
    hdf5_test_path = os.path.join('..', 'data',
                                  'DL1-' + OUTNAME + str(i) + '-test-db.h5')
    hdf5_validate_path = os.path.join(
        '..', 'data', 'DL1-' + OUTNAME + str(i) + '-validate-db.h5')

    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'.format(
        hdf5_train_path, hdf5_test_path, hdf5_validate_path))

    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
Example #17
if __name__ == '__main__':

    track_inputs = ['jet_trk_pt', 'jet_trk_d0',
                    'jet_trk_z0', 'jet_trk_d0sig', 'jet_trk_z0sig',
                    'jet_trk_chi2', 'jet_trk_nInnHits',
                    'jet_trk_nNextToInnHits', 'jet_trk_nBLHits',
                    'jet_trk_nsharedBLHits', 'jet_trk_nsplitBLHits',
                    'jet_trk_nPixHits', 'jet_trk_nsharedPixHits',
                    'jet_trk_nsplitPixHits', 'jet_trk_nSCTHits',
                    'jet_trk_nsharedSCTHits', 'jet_trk_expectBLayerHit'] # 2 more to be added in `process_data`

    print 'Loading dataframes...'
    # -- currently only training and testing on one file each!
    trk_train = pup.root2panda(
        './data/train/*410000_00*.root', 
        'JetCollection', 
        branches = track_inputs + ['jet_truthflav' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi']
    )

    trk_test  = pup.root2panda(
        './data/test/*410000*.root', 
        'JetCollection', 
        branches = track_inputs + ['jet_truthflav' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi']
    )

    print 'Processing training sample ...'
    train_dict = process_data(trk_train, savevars=True)
    del trk_train
    io.save('./data/train_dict_IPConv.h5', train_dict)

    print 'Processing test sample...'
Example #19
        'jet_trk_nSCTHits',
        'jet_trk_nsharedSCTHits',
        'jet_trk_expectBLayerHit',
        #'jet_trk_dPhi'] # more to be added in `process_data`
        'jet_trk_phi'
    ]  # more to be added in `process_data`

    cut_vars = ['jet_eta', 'jet_pt', 'jet_JVT',
                'jet_aliveAfterOR']  # only necessary to remove bad jets

    # -- load and process training set
    print 'Loading training dataframe...'
    trk_train = pup.root2panda(
        './data/Dan/NOtrkSel/train_NOtrkSel.root',
        'bTag_AntiKt4EMTopoJets',
        branches=track_inputs + cut_vars + [
            'jet_LabDr_HadF', 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc',
            'jet_phi', 'jet_trk_theta'
        ])
    print 'Processing training sample ...'
    train_dict = process_data(trk_train, cut_vars, savevars=True)
    del trk_train
    io.save('./data/train_dict_IPConv_ntuple_MyTrkSel.h5', train_dict)

    # -- load and process test set
    print 'Loading test dataframe...'
    trk_test = pup.root2panda(
        './data/Dan/NOtrkSel/test/user.dguest.8493098.Akt4EMTo._000013_NOtrkSel.root',
        'bTag_AntiKt4EMTopoJets',
        branches=track_inputs + cut_vars + [
            'jet_LabDr_HadF', 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc',