Example #1
0
def root2panda(files_path, tree_name, mask=False, **kwargs):
    '''
    Read one tree from every ROOT file matching a glob pattern and return the
    stacked content as a single pandas DataFrame.

    Args:
    -----
        files_path: a glob pattern string like './data/*.root', for example
        tree_name: a string like 'Collection_Tree' naming the tree to open in
                   each file; pass '' to call root2rec without a tree name
        mask: if True, build the DataFrame directly from the underlying data of
              the stacked array instead of trying the stacked array first
        kwargs: arguments taken by root2rec, such as branches to consider, etc
    Returns:
    --------
        output_panda: a pandas DataFrame holding the stacked records of all
                      matched files

    Raises:
    -------
        IOError: if files_path does not match any file

    Note:
    -----
        if you are working with .root files that contain different branches,
        you might have to mask your data; in that case call with mask=True
    '''
    # Local import: numpy is already required through stack_arrays.
    from numpy.ma import getdata

    files = glob.glob(files_path)
    if not files:
        # Fail early with a clear message rather than deep inside stack_arrays.
        raise IOError('No files match {0!r}'.format(files_path))

    # -- an empty tree_name means root2rec is called without a tree name
    # -- (presumably sufficient for files that hold a single tree)
    if tree_name == '':
        records = [root2rec(fpath, **kwargs) for fpath in files]
    else:
        records = [root2rec(fpath, tree_name, **kwargs) for fpath in files]
    ss = stack_arrays(records)

    # getdata() instead of ss.data: stack_arrays hands back its only input
    # unchanged when a single file matched, and on a plain ndarray `.data` is
    # the raw memory buffer rather than the records.
    if mask:
        return pd.DataFrame(getdata(ss))
    try:
        return pd.DataFrame(ss)
    except Exception:
        # Best-effort fallback: retry with the unmasked underlying data.
        return pd.DataFrame(getdata(ss))
Example #2
0
def readFiles():
    """Read the signal and background samples and assemble training inputs.

    Uses the module-level names ``files_signal``, ``files_bg``, ``selection``
    and ``trainVars``.

    Returns:
        tuple: ``(arrSB, fullWeight, targets)`` where ``arrSB`` holds one
        column per training variable (signal entries first, then background),
        ``fullWeight`` is the signal weights followed by the background
        weights rescaled to the signal total, and ``targets`` is 1 for every
        signal entry and 0 for every background entry.
    """
    print('Reading files...')

    def _full_weight(files):
        # Only the 'full_weight' branch of the selected entries is needed.
        rec = root2rec(files, treename='tree', branches=['full_weight'], selection=selection)
        return rec['full_weight']

    sig_weights = _full_weight(files_signal)
    bkg_weights = _full_weight(files_bg)

    sig_total = np.sum(sig_weights)
    bkg_total = np.sum(bkg_weights)

    # Rescale the background so that its summed weight equals the signal's.
    bkg_weights = bkg_weights * sig_total/bkg_total

    n_sig = len(sig_weights)
    n_bkg = len(bkg_weights)

    fullWeight = np.concatenate((sig_weights, bkg_weights))

    structured = root2array(files_signal + files_bg, treename='tree', branches=trainVars(), selection=selection)

    # sklearn needs a matrix-like array, not a 1-D structured array:
    # stack the per-variable columns and put the entries along the first axis.
    columns = [structured[var] for var in trainVars()]
    arrSB = np.asarray(columns).transpose()

    targets = np.concatenate((np.ones(n_sig), np.zeros(n_bkg)))

    print('Done reading files.')

    return arrSB, fullWeight, targets
Example #3
0
def root2panda(files_path, tree_name, mask = False, **kwargs):
    '''
    Read one tree from every ROOT file matching a glob pattern and return the
    stacked content as a single pandas DataFrame.

    Args:
    -----
        files_path: a glob pattern string like './data/*.root', for example
        tree_name: a string like 'Collection_Tree' naming the tree to open in
                   each file; pass '' to call root2rec without a tree name
        mask: if True, build the DataFrame directly from the underlying data of
              the stacked array instead of trying the stacked array first
        kwargs: arguments taken by root2rec, such as branches to consider, etc
    Returns:
    --------
        output_panda: a pandas DataFrame holding the stacked records of all
                      matched files

    Raises:
    -------
        IOError: if files_path does not match any file

    Note:
    -----
        if you are working with .root files that contain different branches,
        you might have to mask your data; in that case call with mask=True
    '''
    # Local import: numpy is already required through stack_arrays.
    from numpy.ma import getdata

    files = glob.glob(files_path)
    if not files:
        # Fail early with a clear message rather than deep inside stack_arrays.
        raise IOError('No files match {0!r}'.format(files_path))

    # -- an empty tree_name means root2rec is called without a tree name
    # -- (presumably sufficient for files that hold a single tree)
    if tree_name == '':
        records = [root2rec(fpath, **kwargs) for fpath in files]
    else:
        records = [root2rec(fpath, tree_name, **kwargs) for fpath in files]
    ss = stack_arrays(records)

    # getdata() instead of ss.data: stack_arrays hands back its only input
    # unchanged when a single file matched, and on a plain ndarray `.data` is
    # the raw memory buffer rather than the records.
    if mask:
        return pd.DataFrame(getdata(ss))
    try:
        return pd.DataFrame(ss)
    except Exception:
        # Best-effort fallback: retry with the unmasked underlying data.
        return pd.DataFrame(getdata(ss))
Example #4
0
def test_slice():
    """Check that the ``start``/``stop`` arguments of root2rec select the expected rows."""
    # (keyword arguments, expected number of rows, expected last n_int value)
    cases = [
        (dict(stop=10), 10, 10),
        (dict(stop=11, start=1), 10, 11),
        # presumably the stop lies beyond the end of the tree: only 5 rows expected
        (dict(stop=105, start=95), 5, 100),
    ]
    for slice_kwargs, n_rows, last_value in cases:
        rec = rnp.root2rec(load('single1.root'), **slice_kwargs)
        assert_equal(len(rec), n_rows)
        assert_equal(rec.n_int[-1], last_value)
Example #5
0
def test_slice():
    """Check that the ``start``/``stop`` arguments of root2rec select the expected rows."""
    # (keyword arguments, expected number of rows, expected last n_int value)
    cases = [
        (dict(stop=10), 10, 10),
        (dict(stop=11, start=1), 10, 11),
        # presumably the stop lies beyond the end of the tree: only 5 rows expected
        (dict(stop=105, start=95), 5, 100),
    ]
    for slice_kwargs, n_rows, last_value in cases:
        rec = rnp.root2rec(load('single1.root'), **slice_kwargs)
        assert_equal(len(rec), n_rows)
        assert_equal(rec.n_int[-1], last_value)
Example #6
0
def test_selection_and_expression():
    """Check that the number of rows passing 'z>0' does not depend on the requested branches."""

    def n_selected(branches):
        # Number of rows returned for the given branches under the same cut.
        rec = rnp.root2rec(load('test.root'), branches=branches, selection='z>0')
        return len(rec)

    ref = n_selected(['x', 'y'])
    # Adding the cut variable itself, or expressions with or without it,
    # must leave the row count unchanged.
    for branches in (['x', 'y', 'z'], ['x', 'x*y'], ['x', 'x*z']):
        assert_equal(ref, n_selected(branches))
Example #7
0
def test_selection_and_expression():
    """Check that the number of rows passing 'z>0' does not depend on the requested branches."""

    def n_selected(branches):
        # Number of rows returned for the given branches under the same cut.
        rec = rnp.root2rec(load('test.root'), branches=branches, selection='z>0')
        return len(rec)

    ref = n_selected(['x', 'y'])
    # Adding the cut variable itself, or expressions with or without it,
    # must leave the row count unchanged.
    for branches in (['x', 'y', 'z'], ['x', 'x*y'], ['x', 'x*z']):
        assert_equal(ref, n_selected(branches))
Example #8
0
def root2panda(file_paths, tree_name, **kwargs):
    '''
    Read one tree from every matching ROOT file and return the stacked content
    as a single pandas DataFrame.

    Args:
    -----
        file_paths: a glob pattern string like './data/*.root', or an iterable
                    of such patterns
        tree_name: a string like 'Collection_Tree' naming the tree to open in
                   each file
        kwargs: arguments taken by root2rec, such as branches to consider, etc
    Returns:
    --------
        output_panda: a pandas DataFrame holding the stacked records of all
                      matched files

    Raises:
    -------
        IOError: if no file matches file_paths

    Note:
    -----
        if building the DataFrame from the stacked (possibly masked) array
        fails, e.g. for .root files that contain different branches, the
        underlying unmasked data is used instead
    '''
    # Local import: numpy is already required through stack_arrays.
    from numpy.ma import getdata

    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    if isinstance(file_paths, string_types):
        patterns = [file_paths]
    else:
        patterns = file_paths
    files = [matched_f for f in patterns for matched_f in glob.glob(f)]
    if not files:
        # Fail early with a clear message rather than deep inside stack_arrays.
        raise IOError('No files match {0!r}'.format(file_paths))

    ss = stack_arrays([root2rec(fpath, tree_name, **kwargs) for fpath in files])
    try:
        return pd.DataFrame(ss)
    except Exception:
        # getdata() instead of ss.data: stack_arrays hands back its only input
        # unchanged when a single file matched, and on a plain ndarray `.data`
        # is the raw memory buffer rather than the records.
        return pd.DataFrame(getdata(ss))
Example #9
0
def test_variable_length_arrays():
    """Check dtype, per-row lengths/shapes and a few values of variable-length branches
    read from 'vary1.root' and 'vary2.root' together."""
    rec = rnp.root2rec(load(['vary1.root', 'vary2.root']))

    length_fields = ['len_n', 'len_f', 'len_d']
    object_fields = [
        'n_char', 'n_uchar',
        'n_short', 'n_ushort',
        'n_int', 'n_uint',
        'n_long', 'n_ulong',
        'f_float', 'd_double',
        'n2_int', 'f2_float', 'd2_double']
    expected_dtype = ([(name, '<i4') for name in length_fields] +
                      [(name, 'O') for name in object_fields])
    assert_equal(rec.dtype, expected_dtype)

    # check lengths: each length branch must agree with its 1-D array branch
    # and with the leading dimension of its 2-D array branch
    flat_pairs = [('len_n', 'n_int'), ('len_f', 'f_float'), ('len_d', 'd_double')]
    shaped_triples = [('len_n', 'n2_int', 2), ('len_f', 'f2_float', 3), ('len_d', 'd2_double', 4)]
    for row in range(len(rec)):
        for len_name, arr_name in flat_pairs:
            assert_equal(getattr(rec, len_name)[row], len(getattr(rec, arr_name)[row]))

        for len_name, arr_name, width in shaped_triples:
            assert_equal((getattr(rec, len_name)[row], width), getattr(rec, arr_name)[row].shape)

    # check elements
    for len_name, expected_len in (('len_n', 0), ('len_f', 1), ('len_d', 2)):
        assert_equal(getattr(rec, len_name)[0], expected_len)
    assert_equal(rec.n_int[-1][-1], 417)
    last_floats = rec.f_float[-1]
    assert_equal(last_floats[0], 380.5)
    assert_equal(last_floats[-1], 456.5)
    last_doubles = rec.d_double[-1]
    assert_equal(last_doubles[0], 380.25)
    assert_equal(last_doubles[-1], 497.25)
Example #10
0
def test_vector():
    """Check dtypes and a few values of the vector branches read from 'hvector.root'."""
    a = rnp.root2rec(load('hvector.root'))
    # every vector branch is stored as an object column
    assert_equal(
        a.dtype,
        [('v_i', 'O'),
         ('v_f', 'O'),
         ('v_F', 'O'),
         ('v_d', 'O'),
         ('v_l', 'O'),
         ('v_c', 'O'),
         ('v_b', 'O')])

    # element type of the per-row arrays
    assert_equal(a.v_i[1].dtype, np.int32)
    assert_equal(a.v_f[1].dtype, np.float32)
    assert_equal(a.v_F[1].dtype, np.float32)
    assert_equal(a.v_d[1].dtype, np.float64)
    assert_equal(a.v_l[1].dtype, np.int64)
    assert_equal(a.v_c[1].dtype, np.int8)
    # np.bool_ rather than the np.bool alias, which NumPy 1.24-1.26 do not provide
    assert_equal(a.v_b[1].dtype, np.bool_)

    # check a couple of values
    assert_equal(a.v_i[1][0], 1)
    assert_equal(a.v_i[2][1], 3)
    assert_equal(a.v_i[-1][0], 99)
    assert_equal(a.v_i[-1][-1], 107)

    # v_f and v_F are expected to hold the same values
    for column in (a.v_f, a.v_F):
        assert_equal(column[1][0], 2.0)
        assert_equal(column[2][1], 5.0)
        assert_equal(column[-1][0], 198.0)
        assert_equal(column[-1][-1], 206.0)
Example #11
0
def test_variable_length_arrays():
    """Check dtype, per-row lengths/shapes and a few values of variable-length branches
    read from 'vary1.root' and 'vary2.root' together."""
    rec = rnp.root2rec(load(['vary1.root', 'vary2.root']))

    length_fields = ['len_n', 'len_f', 'len_d']
    object_fields = [
        'n_char', 'n_uchar',
        'n_short', 'n_ushort',
        'n_int', 'n_uint',
        'n_long', 'n_ulong',
        'f_float', 'd_double',
        'n2_int', 'f2_float', 'd2_double']
    expected_dtype = ([(name, '<i4') for name in length_fields] +
                      [(name, 'O') for name in object_fields])
    assert_equal(rec.dtype, expected_dtype)

    # check lengths: each length branch must agree with its 1-D array branch
    # and with the leading dimension of its 2-D array branch
    flat_pairs = [('len_n', 'n_int'), ('len_f', 'f_float'), ('len_d', 'd_double')]
    shaped_triples = [('len_n', 'n2_int', 2), ('len_f', 'f2_float', 3), ('len_d', 'd2_double', 4)]
    for row in range(len(rec)):
        for len_name, arr_name in flat_pairs:
            assert_equal(getattr(rec, len_name)[row], len(getattr(rec, arr_name)[row]))

        for len_name, arr_name, width in shaped_triples:
            assert_equal((getattr(rec, len_name)[row], width), getattr(rec, arr_name)[row].shape)

    # check elements
    for len_name, expected_len in (('len_n', 0), ('len_f', 1), ('len_d', 2)):
        assert_equal(getattr(rec, len_name)[0], expected_len)
    assert_equal(rec.n_int[-1][-1], 417)
    last_floats = rec.f_float[-1]
    assert_equal(last_floats[0], 380.5)
    assert_equal(last_floats[-1], 456.5)
    last_doubles = rec.d_double[-1]
    assert_equal(last_doubles[0], 380.25)
    assert_equal(last_doubles[-1], 497.25)
Example #12
0
def root2pandas(files_path, tree_name, **kwargs):
  '''
  Read one tree from every ROOT file matching a glob pattern and return the
  stacked content as a single pandas DataFrame.

  Args:
  -----
  files_path: a glob pattern string like './data/*.root', for example
  tree_name: a string like 'Collection_Tree' naming the tree to open in each file
  kwargs: arguments forwarded to root2rec, such as branches to consider, start, stop, step, etc
  Returns:
  --------
  output_panda: a pandas DataFrame holding the stacked records of all matched files
  Raises:
  -------
  IOError: if files_path does not match any file
  Note:
  -----
  if building the DataFrame from the stacked (possibly masked) array fails,
  e.g. for .root files that contain different branches, the underlying
  unmasked data is used instead
  '''
  # Local import: numpy is already required through stack_arrays.
  from numpy.ma import getdata

  # -- create list of .root files to process
  files = glob.glob(files_path)
  if not files:
    # Fail early with a clear message rather than deep inside stack_arrays.
    raise IOError('No files match {0!r}'.format(files_path))

  # -- process ntuples into rec arrays
  ss = stack_arrays([root2rec(fpath, tree_name, **kwargs) for fpath in files])

  try:
    return pd.DataFrame(ss)
  except Exception:
    # getdata() instead of ss.data: stack_arrays hands back its only input
    # unchanged when a single file matched, and on a plain ndarray `.data`
    # is the raw memory buffer rather than the records.
    return pd.DataFrame(getdata(ss))
Example #13
0
def test_vary():
    """Check dtype, per-row lengths and a few values of variable-length branches
    read from 'vary1.root' and 'vary2.root' together."""
    rec = rnp.root2rec(load(['vary1.root', 'vary2.root']))

    length_fields = ['len_n', 'len_f', 'len_d']
    object_fields = [
        'n_char', 'n_uchar',
        'n_short', 'n_ushort',
        'n_int', 'n_uint',
        'n_long', 'n_ulong',
        'f_float', 'd_double']
    expected_dtype = ([(name, '<i4') for name in length_fields] +
                      [(name, 'O') for name in object_fields])
    assert_equal(rec.dtype, expected_dtype)

    # check length: each length branch must agree with its array branch
    pairs = [('len_n', 'n_int'), ('len_f', 'f_float'), ('len_d', 'd_double')]
    for row in range(len(rec)):
        for len_name, arr_name in pairs:
            assert_equal(getattr(rec, len_name)[row], len(getattr(rec, arr_name)[row]))

    # check a couple of elements
    for len_name, expected_len in (('len_n', 0), ('len_f', 1), ('len_d', 2)):
        assert_equal(getattr(rec, len_name)[0], expected_len)
    assert_equal(rec.n_int[-1][-1], 417)
    last_floats = rec.f_float[-1]
    assert_equal(last_floats[0], 380.5)
    assert_equal(last_floats[-1], 456.5)
    last_doubles = rec.d_double[-1]
    assert_equal(last_doubles[0], 380.25)
    assert_equal(last_doubles[-1], 497.25)
Example #14
0
def test_struct():
    """Check that 'struct.root' is read as one record with four leaf columns."""
    actual = rnp.root2rec(load('struct.root'))
    # two int/float leaf pairs, one per branch
    leaf_dtype = [
        ('branch1_intleaf', '<i4'),
        ('branch1_floatleaf', '<f4'),
        ('branch2_intleaf', '<i4'),
        ('branch2_floatleaf', '<f4')]
    expected = np.array([(10, 15.5, 20, 781.2)], dtype=leaf_dtype)
    assert_array_equal(actual, expected)
Example #15
0
def test_object_expression():
    """Check that a method-call expression ('vect.Pt()') can be requested as a branch."""
    expression = 'vect.Pt()'
    rec = rnp.root2rec(load(['object1.root', 'object2.root']),
                       branches=[expression])
    # expected values: 1..10 followed by 2..11, as doubles
    first_part = np.arange(10, dtype='d') + 1
    second_part = np.arange(10, dtype='d') + 2
    expected = np.concatenate([first_part, second_part])
    assert_array_equal(rec[expression], expected)
Example #16
0
def test_object_expression():
    """Check that a method-call expression ('vect.Pt()') can be requested as a branch."""
    expression = 'vect.Pt()'
    rec = rnp.root2rec(load(['object1.root', 'object2.root']),
                       branches=[expression])
    # expected values: 1..10 followed by 2..11, as doubles
    first_part = np.arange(10, dtype='d') + 1
    second_part = np.arange(10, dtype='d') + 2
    expected = np.concatenate([first_part, second_part])
    assert_array_equal(rec[expression], expected)
Example #17
0
def test_struct():
    """Check that 'struct.root' is read as one record with four leaf columns."""
    actual = rnp.root2rec(load('struct.root'))
    # two int/float leaf pairs, one per branch
    leaf_dtype = [
        ('branch1_intleaf', '<i4'),
        ('branch1_floatleaf', '<f4'),
        ('branch2_intleaf', '<i4'),
        ('branch2_floatleaf', '<f4')]
    expected = np.array([(10, 15.5, 20, 781.2)], dtype=leaf_dtype)
    assert_array_equal(actual, expected)
Example #18
0
def harvest(filenames,definitions,**kwargs):
    """
    Extract the variable data from the provided files

    Args:
        filenames (list): the files to extract from; the file type of each
                          must be listed in REGISTERED_FILEEXTENSIONS
        definitions (list): candidate locations of the variable. For ".h5"
                            files each entry is (node path, column name) and
                            the first one found is used; for ".root" files
                            each entry is unpacked as the positional
                            arguments of root2rec after the file name

    Keyword Args:
        transformation (func): will be applied to the read out data

    Returns:
        pd.Series: the data of all files, concatenated in input order
    """
    # NOTE: the original text was built with str.format() at call time, which
    # made it a plain expression instead of a docstring.

    data = pd.Series()
    for filename in filenames:
        filetype = f.strip_all_endings(filename)[1]
        assert filetype in REGISTERED_FILEEXTENSIONS, "Filetype {} not known!".format(filetype)
        # report the missing file by name, not by its type
        assert os.path.exists(filename), "File {} does not exist!".format(filename)
        Logger.debug("Attempting to harvest {1} file {0}".format(filename,filetype))

        if filetype == ".h5" and not isinstance(filename, tables.table.Table):
            hdftable = tables.openFile(filename)
        else:
            hdftable = filename

        tmpdata = pd.Series()
        try:
            for definition in definitions:
                if filetype == ".h5":
                    try:
                        tmpdata = hdftable.getNode("/" + definition[0]).col(definition[1])
                        tmpdata = pd.Series(tmpdata, dtype=n.float64)
                        Logger.debug("Found {} entries in table for {}{}".format(len(tmpdata),definition[0],definition[1]))
                        # first definition present in the file wins
                        break
                    except tables.NoSuchNodeError:
                        Logger.debug("Can not find definition {0} in {1}! ".format(definition, filename))
                        continue

                elif filetype == ".root":
                    tmpdata = rn.root2rec(filename, *definition)
                    # wrap what was just read (previously the accumulated
                    # `data` was wrapped here, discarding the read)
                    tmpdata = pd.Series(tmpdata)
        finally:
            # close the HDF5 handle even when reading raised
            if filetype == ".h5":
                hdftable.close()

        # concat should be much faster than repeated append
        if "transformation" in kwargs:
            transform = kwargs['transformation']
            data = pd.concat([data, tmpdata.map(transform)])
        else:
            data = pd.concat([data, tmpdata])
        del tmpdata
    return data
Example #19
0
def getRootToRec(filename, treename):
    """
        Read a tree from a ROOT file into a numpy record array.
        Inputs: filename and treename, both passed on to root_numpy.root2rec
        Return: rec array
    """
    import root_numpy
    message = '(getRootToRec) building rec array from tree %s in file %s' % (treename, filename)
    info(message)
    rec = root_numpy.root2rec(filename, treename)
    return rec
Example #20
0
def run_code(offline_jetpT_threshold = 0., gTower_jetET_threshold = 0., seed_ETthresh = 0.):
  """Histogram E/cosh(eta) of the trigger jets of all events passing the offline-jet cut.

  Uses the module-level names ``total_num_events``, ``filename``, ``directory``
  and ``tree``. The histogram is pickled and plotted to
  ``events_histogram_leading_trigger_jets_<suffix>.pkl`` / ``.png``.

  Args:
    offline_jetpT_threshold: events whose first offline jet has pT below this are skipped
    gTower_jetET_threshold: only used in the plot title and the output file names
    seed_ETthresh: ET threshold handed to gTowers.SeedFilter
  """
  #set seed cuts
  seed_filter = gTowers.SeedFilter(ETthresh = seed_ETthresh, numSeeds = 1)

  leading_trigger_jets = []

  #column names to pull from the file, must be in this order to sync with the predefined classes in atlas_jets package
  offline_column_names = ['jet_AntiKt10LCTopo_%s' % col for col in ['E', 'pt', 'm', 'eta', 'phi']]
  gTower_column_names = ['gTower%s' % col for col in ['E', 'NCells', 'EtaMin', 'EtaMax', 'PhiMin', 'PhiMax']]

  # number of events that pass the offline-jet requirement
  num_offlineEvents = 0

  # main loop that goes over the file
  for event_num in range(total_num_events):
    if event_num % 100 == 0:
      print("doing event_num=%d for (%d, %d, %d)" % (event_num, offline_jetpT_threshold, gTower_jetET_threshold, seed_ETthresh))
    # pull in data row by row
    data = rnp.root2rec(filename, treename='%s/%s' % (directory,tree), branches=offline_column_names + gTower_column_names, start=(event_num), stop=(event_num+1))
    oEvent = OfflineJets.Event(event=[data[col][0] for col in offline_column_names])

    # if there are no offline jets, or the first one is below threshold, we skip it
    if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
      continue
    num_offlineEvents += 1

    # can use seed_filter on an event by event basis, e.g. with a max number
    # of seeds based on the number of offline jets:
    #seed_filter = gTowers.SeedFilter(numSeeds = len(oEvent.jets))
    tEvent = gTowers.TowerEvent(event=[data[col][0] for col in gTower_column_names], seed_filter = seed_filter)

    tEvent.get_event()
    # extend in place instead of rebuilding the whole list for every event
    leading_trigger_jets.extend(tEvent.event.jets)

  # at this point, we've processed all the data and we just need to make plots

  bins_leading_trigger_jets  = np.arange(0.,4000.,5.)
  hist_leading_trigger_jets  = np.histogram([jet.E/np.cosh(jet.eta) for jet in leading_trigger_jets], bins=bins_leading_trigger_jets)[0]
  # widths of the bins, needed when we make the plots
  width_leading_trigger_jets = np.diff(bins_leading_trigger_jets)

  filename_ending = 'offline%d_gTower%d_seed%d_unweighted' % (offline_jetpT_threshold, gTower_jetET_threshold, seed_filter.ETthresh)

  #make figures
  # Leading Trigger Jets Histogram
  pl.figure()
  pl.xlabel(r'$E_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel('Number of leading trigger jets')
  pl.title(r'$p_T^{\mathrm{offline jet}}$ > %d GeV, %d events, $E_T^{\mathrm{tower jet}}$ > %d GeV, $E_T^{\mathrm{seed}}$ > %d GeV' % (offline_jetpT_threshold, num_offlineEvents, gTower_jetET_threshold, seed_filter.ETthresh))
  pl.bar(bins_leading_trigger_jets[:-1], hist_leading_trigger_jets, width=width_leading_trigger_jets)
  pl_lJet = {'bins': bins_leading_trigger_jets,
             'values': hist_leading_trigger_jets,
             'width': width_leading_trigger_jets}
  # binary mode plus a context manager: the handle is closed (and flushed) deterministically
  with open('events_histogram_leading_trigger_jets_%s.pkl' % filename_ending, 'wb') as pkl_file:
    pickle.dump(pl_lJet, pkl_file)
  pl.savefig('events_histogram_leading_trigger_jets_%s.png' % filename_ending)
  pl.close()
Example #21
0
def readFiles():
    """Read the signal, ZTT and background samples and assemble training inputs.

    Uses the module-level names ``files_sig``, ``files_ZTT``, ``files_bg``,
    ``selection``, ``trainVars`` and ``vals``.

    Returns:
        tuple: ``(arrSB, fullWeight, unNormFullWeight, targets)`` where
        ``arrSB`` holds one column per training variable (signal, then ZTT,
        then background entries), ``fullWeight`` has the ZTT and background
        weights rescaled to the signal total, ``unNormFullWeight`` holds the
        weights as read, and ``targets`` labels the three samples with
        ``vals[0]``, ``vals[1]`` and ``vals[2]``.
    """
    print('Reading files...')

    def _full_weight(files):
        # Only the 'full_weight' branch of the selected entries is needed.
        rec = root2rec(files, treename='tree', branches=['full_weight'], selection=selection)
        return rec['full_weight']

    sig_weights = _full_weight(files_sig)
    ztt_weights = _full_weight(files_ZTT)
    bkg_weights = _full_weight(files_bg)

    sig_total = np.sum(sig_weights)
    ztt_total = np.sum(ztt_weights)
    bkg_total = np.sum(bkg_weights)

    # Rescale ZTT and background so each sums to the signal total.
    ztt_normed = ztt_weights * sig_total/ztt_total
    bkg_normed = bkg_weights * sig_total/bkg_total

    n_sig = len(sig_weights)
    n_ztt = len(ztt_weights)
    n_bkg = len(bkg_weights)

    fullWeight = np.concatenate((sig_weights, ztt_normed, bkg_normed))
    unNormFullWeight = np.concatenate((sig_weights, ztt_weights, bkg_weights))

    structured = root2array(files_sig + files_ZTT + files_bg, treename='tree', branches=trainVars(), selection=selection)

    # sklearn needs a matrix-like array, not a 1-D structured array:
    # stack the per-variable columns and put the entries along the first axis.
    columns = [structured[var] for var in trainVars()]
    arrSB = np.asarray(columns).transpose()

    # One class label per sample, taken from the module-level `vals`.
    sig_targets = np.ones(n_sig)*vals[0]
    ztt_targets = np.ones(n_ztt)*vals[1]
    bkg_targets = np.ones(n_bkg)*vals[2]
    targets = np.concatenate((sig_targets, ztt_targets, bkg_targets))

    print('Done reading files.')

    return arrSB, fullWeight, unNormFullWeight, targets
Example #22
0
File: knn.py Project: mackaiver/smd
def main():
    """Run k-nearest-neighbour classification of signal vs. background from '../Blatt7.root'.

    Trains on the first 5000 signal and 5000 background entries, evaluates on
    the first 10000 signal and 20000 background entries with k=10 and k=20,
    then repeats the k=10 evaluation with log10(AnzahlHits) as first feature.
    """
    filename = '../Blatt7.root'

    signal_size = 10000
    background_size = 20000
    # number of entries taken from each sample for the training set
    training_size = 5000

    print("Reading Data from file " + filename)
    background = root2rec(filename, 'Untergrund_MC', branches=['AnzahlHits', 'x', 'y'])
    signal = root2rec(filename, 'Signal_MC_Akzeptanz', branches=['AnzahlHits', 'x', 'y'])

    # one row per entry, columns: AnzahlHits, x, y
    background = np.asarray([background['AnzahlHits'], background['x'], background['y']]).T
    signal = np.asarray([signal['AnzahlHits'], signal['x'], signal['y']]).T

    training = np.append(signal[:training_size], background[:training_size], axis=0)
    # builtin int instead of the np.int alias, which was removed in NumPy 1.24
    label = np.append(np.ones(training_size, dtype=int), np.zeros(training_size, dtype=int), axis=0)

    test_data = np.append(signal[:signal_size], background[:background_size], axis=0)
    test_label = np.append(np.ones(signal_size, dtype=int), np.zeros(background_size, dtype=int), axis=0)

    print("Creating KD-Tree")
    kd = KDTree(training, leafsize=20)
    print("Starting prediction with k = 10")
    prediction = knn(data=test_data, label=label, tree=kd, k=10)
    performance(label=test_label, prediction=prediction)

    print("Starting prediction with k = 20")
    prediction = knn(data=test_data, label=label, tree=kd, k=20)
    performance(label=test_label, prediction=prediction)

    print("Using log(AnzahlHits)")
    #new training and test data for log10(AnzahlHits); labels are unchanged
    background[:,0] = np.log10(background[:,0])
    signal[:,0] = np.log10(signal[:,0])
    training = np.append(signal[:training_size], background[:training_size], axis=0)
    test_data = np.append(signal[:signal_size], background[:background_size], axis=0)
    print("Creating KD-Tree")
    kd = KDTree(training, leafsize=20)
    print("Starting prediction")
    prediction = knn(data=test_data, label=label, tree=kd, k=10)
    performance(label=test_label, prediction=prediction)
Example #23
0
def test_string():
    """Check that string branches of 'string.root' are read as object columns with the expected text."""
    rec = rnp.root2rec(load('string.root'))
    expected_dtype = [(name, 'O') for name in ('message', 'vect', 'vect2d')]
    assert_equal(rec.dtype, expected_dtype)
    first_row = rec[0]
    # plain string, first element of a vector, first element of a nested vector
    assert_equal(first_row[0], 'Hello World!')
    assert_equal(first_row[1][0], 'Hello!')
    assert_equal(first_row[2][0][0], 'Hello!')
Example #24
0
def test_string():
    """Check that string branches of 'string.root' are read as object columns with the expected text."""
    rec = rnp.root2rec(load('string.root'))
    expected_dtype = [(name, 'O') for name in ('message', 'vect', 'vect2d')]
    assert_equal(rec.dtype, expected_dtype)
    first_row = rec[0]
    # plain string, first element of a vector, first element of a nested vector
    assert_equal(first_row[0], 'Hello World!')
    assert_equal(first_row[1][0], 'Hello!')
    assert_equal(first_row[2][0][0], 'Hello!')
Example #25
0
def test_stack():
    """Check row count and field names of rnp.stack for identical and differing inputs."""
    rec = rnp.root2rec(load('test.root'))
    doubled = 2 * rec.shape[0]

    # stacking an array with itself keeps all fields
    stacked = rnp.stack([rec, rec])
    assert_equal(stacked.shape[0], doubled)
    assert_equal(stacked.dtype.names, rec.dtype.names)

    # an explicit field list restricts the output
    stacked = rnp.stack([rec, rec], fields=['x', 'y'])
    assert_equal(stacked.shape[0], doubled)
    assert_equal(stacked.dtype.names, ('x', 'y'))

    # recs don't have identical fields: only 'y' and 'z' are expected to remain
    reduced = recfunctions.drop_fields(rec, ['i', 'x'])
    stacked = rnp.stack([rec, reduced])
    assert_equal(set(stacked.dtype.names), {'y', 'z'})
Example #26
0
def getvars(vars, flav, filename, ptmin=20, ptmax=200, etamin=0., etamax=2.1):
    """Return the '<flav>_<var>' columns of every second entry (odd indices)
    of tree 'tree' that pass the pt / |eta| window.

    Args:
        vars: variable names; each is read from the branch flav + '_' + name
        flav: branch-name prefix
        filename: ROOT file passed to root2rec
        ptmin, ptmax: exclusive bounds on the '<flav>_pt' branch
        etamin, etamax: exclusive bounds on the absolute '<flav>_eta' value

    Returns:
        2-D array with one row per selected entry and one column per variable.
    """
    pt_leaf = flav + '_pt'
    eta_leaf = flav + '_eta'
    train_leaves = [flav + '_' + name for name in vars]
    rec = root2rec(filename, 'tree', train_leaves + [pt_leaf, eta_leaf])

    # keep entries 1, 3, 5, ...
    odd = slice(1, None, 2)
    features = np.vstack([rec[leaf] for leaf in train_leaves]).T[odd]
    pt = rec[pt_leaf][odd]
    abs_eta = np.fabs(rec[eta_leaf][odd])

    keep = (pt > ptmin) & (pt < ptmax) & (abs_eta > etamin) & (abs_eta < etamax)
    return features[keep]
Example #27
0
def test_stack():
    """Check row count and field names of rnp.stack for identical and differing inputs."""
    rec = rnp.root2rec(load('test.root'))
    doubled = 2 * rec.shape[0]

    # stacking an array with itself keeps all fields
    stacked = rnp.stack([rec, rec])
    assert_equal(stacked.shape[0], doubled)
    assert_equal(stacked.dtype.names, rec.dtype.names)

    # an explicit field list restricts the output
    stacked = rnp.stack([rec, rec], fields=['x', 'y'])
    assert_equal(stacked.shape[0], doubled)
    assert_equal(stacked.dtype.names, ('x', 'y'))

    # recs don't have identical fields: only 'y' and 'z' are expected to remain
    reduced = recfunctions.drop_fields(rec, ['i', 'x'])
    stacked = rnp.stack([rec, reduced])
    assert_equal(set(stacked.dtype.names), {'y', 'z'})
Example #28
0
def run_code(event_num):
  """Build and save the tower, offline-jet and trigger-jet grids of one event.

  Uses the module-level names ``filename``, ``directory``, ``tree``,
  ``offline_column_names``, ``gTower_column_names``, ``seed_filter`` and
  ``domain``. Each grid is handed to ``Grid.save`` with a file name of the
  form ``event_<event_num>_*.png``.

  Args:
    event_num: index of the single tree entry to read
  """
  # read exactly one entry: [event_num, event_num + 1)
  row = rnp.root2rec(filename, treename='%s/%s' % (directory,tree), branches=offline_column_names + gTower_column_names, start=(event_num), stop=(event_num+1))
  offline_event = OfflineJets.Event(event=[row[col][0] for col in offline_column_names])
  tower_event = gTowers.TowerEvent(event=[row[col][0] for col in gTower_column_names], seed_filter = seed_filter)
  tower_event.get_event()

  # raw gTowers
  towers = gTowers.Grid(cell_resolution=0.02, domain=domain)
  towers.add_tower_event(tower_event)
  towers_title = 'Event %d, gTowers, cell resolution=0.02' % event_num
  towers_file = 'event_%d_towers.png' % event_num
  towers.save(title=towers_title, filename=towers_file, colzLabel = r'$E_T^{\mathrm{tower}}$')

  # offline jets
  offline = gTowers.Grid(cell_resolution=0.02, recon_algo = 'gaussian', domain=domain)
  offline.add_event(offline_event)
  offline_title = 'Event %d, offline jets, cell resolution=0.02' % event_num
  offline_file = 'event_%d_offline_jets.png' % event_num
  offline.save(title=offline_title, filename=offline_file, colzLabel = r'$p_T^{\mathrm{jet}}$')

  # trigger jets, taken from a second get_event() call on the tower event
  trigger = gTowers.Grid(cell_resolution=0.02, recon_algo = 'gaussian', domain=domain)
  trigger.add_event(tower_event.get_event())
  trigger_title = 'Event %d, trigger jets, cell resolution=0.02' % event_num
  trigger_file = 'event_%d_trigger_jets.png' % event_num
  trigger.save(title=trigger_title, filename=trigger_file, colzLabel = r'$E_T^{\mathrm{jet}}$')
Example #29
0
    def harvest_from_rootfile(self,rootfile,definition):
        """
        Get data from a root file

        Args:
            rootfile (str): Name of the *.root file
            definition (tuple): Name of branches/leaves in the rootfile;
                                unpacked as the positional arguments of
                                root2rec after the file name
        Returns:
            pd.Series (self.defsize == 2) or pd.DataFrame (self.defsize == 1)
        Raises:
            ValueError: if self.defsize is neither 1 nor 2
        """
        # Pick the container first so that an unsupported defsize is
        # rejected before the file is read.
        if self.defsize == 2:
            container = pd.Series
        elif self.defsize == 1:
            container = pd.DataFrame
        else:
            raise ValueError(
                "Unsupported definition size {0!r}, expected 1 or 2".format(self.defsize))

        #FIXME: What happens if it is not found in the rootfile
        data = rn.root2rec(rootfile,*definition)
        return container(data)
Example #30
0
def getjetvar(jet, var, filename, ptmin=20, ptmax=200, etamin=0., etamax=2.1,
              nocut=True):
    """Read the branch jet + var from tree 'tree', optionally restricted to a pt / |eta| window.

    Args:
        jet: branch-name prefix (concatenated directly, without separator)
        var: variable name; 'pt' and 'eta' are always read
        filename: ROOT file passed to root2rec
        ptmin, ptmax: exclusive bounds on the jet + 'pt' branch
        etamin, etamax: exclusive bounds on the absolute jet + 'eta' value
        nocut: when True (the default) the window is NOT applied

    Returns:
        The values of the requested branch, filtered only if ``nocut`` is False.
    """
    pt_leaf = jet + 'pt'
    eta_leaf = jet + 'eta'
    wanted = [pt_leaf, eta_leaf]
    if var not in ('pt', 'eta'):
        wanted.append(jet + var)
    rec = root2rec(filename, 'tree', wanted)

    values = rec[jet + var]
    pt = rec[pt_leaf]
    eta = rec[eta_leaf]

    if nocut:
        return values

    abs_eta = np.fabs(eta)
    inside = (pt > ptmin) & (pt < ptmax) & (abs_eta > etamin) & (abs_eta < etamax)
    return values[inside]
Example #31
0
def test_vary():
    """Check dtype, per-row lengths and a few values of variable-length branches
    read from 'vary1.root' and 'vary2.root' together."""
    rec = rnp.root2rec(load(['vary1.root', 'vary2.root']))

    # (length branch, array branch) pairs, in dtype order
    pairs = [('len_n', 'n_int'), ('len_f', 'f_float'), ('len_d', 'd_double')]
    expected_dtype = ([(len_name, '<i4') for len_name, _ in pairs] +
                      [(arr_name, 'O') for _, arr_name in pairs])
    assert_equal(rec.dtype, expected_dtype)

    # check length: each length branch must agree with its array branch
    for row in range(len(rec)):
        for len_name, arr_name in pairs:
            assert_equal(getattr(rec, len_name)[row], len(getattr(rec, arr_name)[row]))

    # check a couple of elements
    for len_name, expected_len in (('len_n', 0), ('len_f', 1), ('len_d', 2)):
        assert_equal(getattr(rec, len_name)[0], expected_len)
    assert_equal(rec.n_int[-1][-1], 417)
    last_floats = rec.f_float[-1]
    assert_equal(last_floats[0], 380.5)
    assert_equal(last_floats[-1], 456.5)
    last_doubles = rec.d_double[-1]
    assert_equal(last_doubles[0], 380.25)
    assert_equal(last_doubles[-1], 497.25)
Example #32
0
def gettracks(vars, flav, filename, ptmin=20, ptmax=200, etamin=0., etamax=2.1,
              train=1):
    """Return the zero-filled '<flav>_trk<var>' columns of every second entry
    of tree 'tree' that pass the pt / |eta| window.

    Args:
        vars: variable names; each is read from the branch flav + '_trk' + name
        flav: branch-name prefix
        filename: ROOT file passed to root2rec
        ptmin, ptmax: exclusive bounds on the '<flav>_pt' branch
        etamin, etamax: exclusive bounds on the absolute '<flav>_eta' value
        train: index of the first entry kept; every second entry from there is used

    Returns:
        The selected entries of the array built from the ``zerofill``-ed
        track branches.
    """
    pt_leaf = flav + '_pt'
    eta_leaf = flav + '_eta'
    track_leaves = [flav + '_trk' + name for name in vars]
    rec = root2rec(filename, 'tree', track_leaves + [pt_leaf, eta_leaf])

    # entries train, train + 2, train + 4, ...
    every_other = slice(train, None, 2)

    per_leaf = [zerofill(rec[leaf].tolist()).T for leaf in track_leaves]
    tracks = np.array(per_leaf).T[every_other]
    pt = rec[pt_leaf][every_other]
    abs_eta = np.fabs(rec[eta_leaf][every_other])

    keep = (pt > ptmin) & (pt < ptmax) & (abs_eta > etamin) & (abs_eta < etamax)
    return tracks[keep]
Example #33
0
def getvar(var,
           flav,
           filename,
           reco=False,
           ptmin=80,
           ptmax=110,
           etamin=0.,
           etamax=2.1,
           train=0):
    """Return one variable for every second entry of tree 'tree' that passes the pt / |eta| window.

    Args:
        var: variable name; read from '<flav>_<var>', or from
             '<flav>reco_<var>' when ``reco`` is True
        flav: branch-name prefix; the cuts always use '<flav>_pt' and '<flav>_eta'
        filename: ROOT file passed to root2rec
        reco: read the variable from the '<flav>reco' branches
        ptmin, ptmax: exclusive bounds on the '<flav>_pt' branch
        etamin, etamax: exclusive bounds on the absolute '<flav>_eta' value
        train: index of the first entry kept; every second entry from there is used

    Returns:
        The selected values of the requested branch.
    """
    varflav = (flav + 'reco') if reco else flav

    pt_leaf = flav + '_pt'
    eta_leaf = flav + '_eta'
    var_leaf = varflav + '_' + var

    # Request the variable's branch whenever it is not one of the cut branches.
    # Testing the full branch name (not just `var`) also covers reco=True with
    # var 'pt' or 'eta', where the reco branch differs from the cut branch and
    # was previously never loaded.
    leaves = [pt_leaf, eta_leaf]
    if var_leaf not in leaves:
        leaves.append(var_leaf)
    array = root2rec(filename, 'tree', leaves)

    values = array[var_leaf][train::2]
    pt = array[pt_leaf][train::2]
    eta = array[eta_leaf][train::2]

    return values[(pt > ptmin) & (pt < ptmax) & (np.fabs(eta) > etamin) &
                  (np.fabs(eta) < etamax)]
Example #34
0
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib

#load data
op = optparse.OptionParser(usage=__doc__)
op.add_option("--treeplot", dest="TREEPLOT", default=False, action="store_true", help="Make a decision tree classifier plot")
op.add_option("--bounds", dest="BOUNDS", default=False, action="store_true", help="Plot decision tree boundaries in two input variables")

opts, args = op.parse_args()

# Both input files are required: exit with the usage message rather than
# failing with an IndexError below.
if len(args) < 2:
    op.error("expected two positional arguments: <signal file> <background file>")

infname_sig, infname_bkg = args[0], args[1]

#set up DataFrames: the same four branches/expressions are read from both files
df_sig = pandas.DataFrame(root_numpy.root2rec(infname_sig, branches=["tau2_sd","tau3_sd","softdropjet.M()","fatjet.M()"]))
df_bkg = pandas.DataFrame(root_numpy.root2rec(infname_bkg, branches=["tau2_sd","tau3_sd","softdropjet.M()","fatjet.M()"]))

# label column: 1 for signal rows, 0 for background rows
df_sig["is_signal"] = 1
df_bkg["is_signal"] = 0

# merge both samples and shuffle the rows
df = pandas.concat([df_sig, df_bkg], ignore_index=True)
df = df.iloc[np.random.permutation(len(df))]

# the first 150000 shuffled rows are used for training, the rest for testing
df_train = df[0:150000]
df_test_orig = df[150000:]

# plain arrays of the frames (df_test_orig keeps the DataFrame form)
df_sig = np.asarray(df_sig)
df_bkg = np.asarray(df_bkg)
df_train = np.asarray(df_train)
df_test = np.asarray(df_test_orig)
Example #35
0
from root_numpy import root2rec
import numpy as np
import pylab as pl
import pickle

# read the sample
sample = root2rec('sample.root')
y = sample['label']
# feature matrix: one row per entry, columns 'a' and 'b'
X = np.vstack([sample[var] for var in ['a', 'b']]).T

# Pickles are binary data, so the file must be opened in 'rb' mode.
# NOTE: unpickling executes code from the file; only load trusted pickles.
with open('sklearn_bdt.pickle', 'rb') as f:
    bdt = pickle.load(f)

plot_colors = "br"
plot_step = 0.02
class_names = "AB"

pl.figure(figsize=(10, 5))

# Plot the decision boundaries
pl.subplot(121)
# grid covering the data range with a margin of 1 on every side
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                     np.arange(y_min, y_max, plot_step))

# predicted class at every grid point, reshaped back onto the grid
Z = bdt.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
cs = pl.contourf(xx, yy, Z, cmap=pl.cm.Paired)
pl.axis("tight")
Example #36
0
# base name of the input file (without '.root'), taken from the command line
plik = str(sys.argv[1])

# Create the output directory for this file. Only OS-level failures of mkdir
# are tolerated; anything else should surface.
try:
    os.mkdir("./rysunki/"+plik)
except OSError:
    print("Jedziemyyyy....")

# by hand
#pliki=["ot001", "ot002", "ot009", "ot019", "ot020" , "nt001"]; 

# from the command line
#pliki = [];
#for i in xrange(1, len(sys.argv)):
#     pliki += [str(sys.argv[i])]

rec = root2rec(plik+'.root', "tvec")


'''
x=[];z=[];
for i in xrange(0, len(rec)):
    if (math.fabs(rec.Teta[i]) < 1.61): 
        x.append(rec.Tbeta[i]);
        if (rec.Tl1[i]):
            z.append(rec.Tbeta[i])
'''

# Tbeta of the entries with |Teta| < 1.61
x = np.extract(np.absolute(rec.Teta) < 1.61, rec.Tbeta)
# Tbeta of the entries with Tl1 set
# NOTE(review): the loop kept in the string above applied the |Teta| cut to z
# as well; this vectorised form does not - confirm which one is intended.
z = np.extract(rec.Tl1, rec.Tbeta)

def run_code(offline_jetpT_threshold = 0., gTower_jetET_threshold = 0., seed_ETthresh = 0.):
  '''Histogram the gTower data event by event and save tower and turn-on plots.

  Events are read one at a time with rnp.root2rec.  An event is kept when it
  has at least one offline jet and the first offline jet has
  pT >= offline_jetpT_threshold.  For kept events the tower ET spectrum
  (E/cosh(eta)) is accumulated, offline jets are matched to tower jets with
  match_jets, and the turn-on numerator/denominator are filled.

  Args:
    offline_jetpT_threshold: pT cut applied to the first offline jet of an
      event and to every matched offline jet.
    gTower_jetET_threshold: a matched pair enters the turn-on numerator only
      when the tower jet has E/cosh(eta) above this value.
    seed_ETthresh: ET threshold handed to gTowers.SeedFilter.

  Side effects:
    Writes six '.pkl' / '.png' pairs to the current directory, named
    'events_*_offline%d_gTower%d_seed%d_unweighted'.

  Relies on names defined outside this function: total_num_events, filename,
  directory, tree, gTowers, OfflineJets, match_jets, rnp, np, pl, pickle.
  '''
  def save_plot(stem, payload):
    # Persist the plotted numbers (.pkl) next to the rendered figure (.png).
    # The pickle goes through a context manager in binary mode so the handle
    # is always closed and the stream is valid on Python 2 and 3.
    with open('%s_%s.pkl' % (stem, filename_ending), 'wb') as handle:
      pickle.dump(payload, handle)
    pl.savefig('%s_%s.png' % (stem, filename_ending))
    pl.close()

  def reverse_cumsum(values):
    # entry i holds the sum of values[i:]
    return np.cumsum(values[::-1])[::-1]

  def binomial_errors(hist_ratio, hist_one, hist_two):
    # root.cern.ch/root/html/src/TH1.cxx.html#l5.yxD
    # formula cited (for histograms [num, den] with no errors) is:
    #     w = num/den
    #     if w = 1:
    #             sigma = 0
    #     else:
    #             sigma = abs( (1 - 2*w + w**2) / den**2 )
    # The value appended is the square root of that sigma.
    # NOTE(review): hist_one (the numerator) is accepted but never used by
    # this formula - confirm that this is intended.
    errors = []
    for w, num, den in zip(hist_ratio, hist_one, hist_two):
      if w == 1.0:
        errors.append(0.0)
      else:
        errors.append( (np.abs( (1.-2.*w + w**2.)/den**2.))**0.5 )
    return errors

  #set seed cuts
  seed_filter = gTowers.SeedFilter(ETthresh = seed_ETthresh, numSeeds = 1.0e5)

  #column names to pull from the file, must be in this order to sync with the predefined classes in atlas_jets package
  offline_column_names = ['jet_AntiKt10LCTopo_%s' % col for col in ['E', 'pt', 'm', 'eta', 'phi']]
  gTower_column_names = ['gTower%s' % col for col in ['E', 'NCells', 'EtaMin', 'EtaMax', 'PhiMin', 'PhiMax']]

  #bins for all histograms
  bins_towerMultiplicity = np.arange(0, 1000, 5).astype(float)
  bins_towerHistogram    = np.array([0,50,100,150,200,250,300,350,400,500,750,1000,4000]).astype(float)
  bins_efficiency        = np.arange(0,1240, 20).astype(float)

  hist_towerMultiplicity = np.zeros(len(bins_towerMultiplicity)-1).astype(float)
  hist_towerHistogram    = np.zeros(len(bins_towerHistogram)-1).astype(float)
  hist_efficiency_num    = np.zeros(len(bins_efficiency)-1).astype(float)
  hist_efficiency_den    = np.zeros(len(bins_efficiency)-1).astype(float)

  num_offlineEvents = 0

  # main loop that goes over the file
  for event_num in range(total_num_events):
    if event_num % 100 == 0:
      print("doing event_num=%d for (%d, %d, %d)" % (event_num, offline_jetpT_threshold, gTower_jetET_threshold, seed_ETthresh))
    # pull in data row by row
    data = rnp.root2rec(filename, treename='%s/%s' % (directory,tree), branches=offline_column_names + gTower_column_names, start=(event_num), stop=(event_num+1))
    oEvent = OfflineJets.Event(event=[data[col][0] for col in offline_column_names])

    # if there are no offline jets, we skip it
    if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
      continue
    num_offlineEvents += 1

    # can use seed_filter on an event by event basis
    # max number of seeds based on number of offline jets
    #seed_filter = gTowers.SeedFilter(numSeeds = len(oEvent.jets))
    tEvent = gTowers.TowerEvent(event=[data[col][0] for col in gTower_column_names], seed_filter = seed_filter)
    # build up the first two histograms using just the gTower data
    # note, we have np.histogram(...)[0] since we only need the hist data
    tower_ETs = [tower.E/np.cosh(tower.eta) for tower in tEvent.towers]
    hist_towerMultiplicity += reverse_cumsum(np.histogram(tower_ETs, bins=bins_towerMultiplicity)[0])
    hist_towerHistogram += np.histogram(tower_ETs, bins=bins_towerHistogram)[0]

    tEvent.get_event()
    #paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.filter_towers())
    paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.event.jets)
    paired_data = np.array([[oJet.pT, tJet.E/np.cosh(tJet.eta)] for oJet,tJet in paired_jets if oJet.pT > offline_jetpT_threshold])
    if paired_data.size == 0:
      # No pair survived the cut: np.array([]) is 1-D, so the column indexing
      # below would raise IndexError, and there is nothing to histogram.
      continue
    # build up the turn on curve histograms
    passed_trigger = paired_data[:,1] > gTower_jetET_threshold
    hist_efficiency_den += np.histogram(paired_data[:,0], bins=bins_efficiency)[0]
    hist_efficiency_num += np.histogram(paired_data[passed_trigger,0], bins=bins_efficiency)[0]

  # at this point, we've processed all the data and we just need to make plots

  # first get the widths of the bins when we make the plots
  width_towerMultiplicity = np.diff(bins_towerMultiplicity)
  width_towerHistogram    = np.diff(bins_towerHistogram)
  width_efficiency        = np.diff(bins_efficiency)

  # rescale tower data to define it per event (skipped when no event passed,
  # which would otherwise divide by zero and fill the histograms with NaN)
  if num_offlineEvents > 0:
    hist_towerMultiplicity = 1.0*hist_towerMultiplicity/num_offlineEvents
    hist_towerHistogram    = 1.0*hist_towerHistogram/num_offlineEvents

  #histogram y-range
  hist_ylim = (10.**-3., 10.**4.)

  filename_ending = 'offline%d_gTower%d_seed%d_unweighted' % (offline_jetpT_threshold, gTower_jetET_threshold, seed_filter.ETthresh)

  # every figure carries the same title; raw strings keep the LaTeX
  # backslashes literal without relying on invalid escape sequences
  title = r'$p_T^{\mathrm{offline jet}}$ > %d GeV, %d events, $E_T^{\mathrm{tower jet}}$ > %d GeV, $E_T^{\mathrm{seed}}$ > %d GeV' % (offline_jetpT_threshold, num_offlineEvents, gTower_jetET_threshold, seed_filter.ETthresh)

  #make figures
  # Tower Multiplicity
  pl.figure()
  pl.xlabel(r'$E_T^{\mathrm{threshold}}$ [GeV]')
  pl.ylabel('Number of gTowers per event')
  pl.title(title)
  pl.bar(bins_towerMultiplicity[:-1], hist_towerMultiplicity, width=width_towerMultiplicity, log=True)
  pl.ylim(hist_ylim)
  pl_tMult = {'bins': bins_towerMultiplicity,
              'values': hist_towerMultiplicity,
              'width': width_towerMultiplicity}
  save_plot('events_threshold_histogram_multiplicity', pl_tMult)

  # Tower Histogram per Event
  pl.figure()
  pl.xlabel(r'$p_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel('Number of gTowers per event')
  pl.title(title)
  pl.bar(bins_towerHistogram[:-1], hist_towerHistogram, width=width_towerHistogram, log=True)
  pl.ylim(hist_ylim)
  pl_tHist = {'bins': bins_towerHistogram,
              'values': hist_towerHistogram,
              'width': width_towerHistogram}
  save_plot('events_threshold_histogram_towers', pl_tHist)

  # Turn on curves: the denominator plot fixes the axis limits that the
  # numerator and efficiency plots reuse
  pl.figure()
  pl.xlabel(r'offline $p_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel('Turn-On Curve Denominator')
  pl.title(title)
  pl.bar(bins_efficiency[:-1], hist_efficiency_den, width=width_efficiency)
  xlim_efficiency = (0.0, pl.xlim()[1])
  pl.xlim(xlim_efficiency)
  ylim_efficiency = (0.0, pl.ylim()[1])
  pl.ylim(ylim_efficiency)
  pl_turnon_den = {'bins': bins_efficiency,
                   'values': hist_efficiency_den,
                   'width': width_efficiency}
  save_plot('events_turnon_denominator', pl_turnon_den)

  pl.figure()
  pl.xlabel(r'offline $p_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel('Turn-On Curve Numerator')
  pl.title(title)
  pl.bar(bins_efficiency[:-1], hist_efficiency_num, width=width_efficiency)
  pl.xlim(xlim_efficiency)
  pl.ylim(ylim_efficiency)
  pl_turnon_num = {'bins': bins_efficiency,
                   'values': hist_efficiency_num,
                   'width': width_efficiency}
  save_plot('events_turnon_numerator', pl_turnon_num)

  # only bins with a non-zero denominator can yield an efficiency
  nonzero_bins = np.where(hist_efficiency_den != 0)
  num_nonzero = hist_efficiency_num[nonzero_bins]
  den_nonzero = hist_efficiency_den[nonzero_bins]
  num_integral = reverse_cumsum(num_nonzero)
  den_integral = reverse_cumsum(den_nonzero)
  #compute integral and differential curves
  hist_efficiency_curve_differential = np.true_divide(num_nonzero, den_nonzero)
  hist_efficiency_curve_integral = np.true_divide(num_integral, den_integral)
  #get halfway in between really
  xpoints_efficiency = bins_efficiency[:-1] + width_efficiency/2.

  errors_efficiency_differential = binomial_errors(hist_efficiency_curve_differential, num_nonzero, den_nonzero)
  errors_efficiency_integral     = binomial_errors(hist_efficiency_curve_integral, num_integral, den_integral)

  pl.figure()
  pl.xlabel(r'offline $p_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel('Trigger Efficiency - Differential')
  pl.title(title)
  pl.errorbar(xpoints_efficiency[nonzero_bins], hist_efficiency_curve_differential, yerr=errors_efficiency_differential, ecolor='black')
  pl.xlim(xlim_efficiency)
  pl.ylim((0.0,1.2))
  pl.grid(True)
  pl_eff_diff = {'xdata': xpoints_efficiency,
                 'ydata': hist_efficiency_curve_differential,
                 'xerr' : 1.0,
                 'yerr' : errors_efficiency_differential,
                 'num'  : hist_efficiency_num,
                 'den'  : hist_efficiency_den,
                 'bins' : bins_efficiency,
                 'nonzero_bins': nonzero_bins}
  save_plot('events_turnon_curve_differential', pl_eff_diff)

  pl.figure()
  pl.xlabel(r'offline $p_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel('Trigger Efficiency - Integral')
  pl.title(title)
  pl.errorbar(xpoints_efficiency[nonzero_bins], hist_efficiency_curve_integral, yerr=errors_efficiency_integral, ecolor='black')
  pl.xlim(xlim_efficiency)
  pl.ylim((0.0,1.2))
  pl.grid(True)
  pl_eff_int = {'xdata': xpoints_efficiency,
                'ydata': hist_efficiency_curve_integral,
                'xerr' : 1.0,
                'yerr' : errors_efficiency_integral,
                'num'  : hist_efficiency_num,
                'den'  : hist_efficiency_den,
                'bins' : bins_efficiency,
                'nonzero_bins': nonzero_bins}
  save_plot('events_turnon_curve_integral', pl_eff_int)
Example #38
0
# Create the output directory for figures.  Only failures of mkdir itself
# (for example the directory already existing) are tolerated; other
# exceptions are no longer swallowed.
try:
    os.mkdir("./rysunki/")
except OSError:
    print("Jedziemyyyy....")

# file list set by hand
#pliki=["ot001", "ot002", "ot009", "ot019", "ot020" , "nt001"];



PtBins=[0., 0.1,1.5, 2., 2.5, 3., 3.5, 4., 4.5, 5., 6., 7., 8.,10., 12., 14., 16., 18., 20., 25., 30., 35., 40., 45.,50., 60., 70., 80., 90., 100., 120., 140.,160.];

# width of every PtBins interval
PtWidth=[(PtBins[j+1]-PtBins[j]) for j in range(len(PtBins)-1)]

bin_width = 0.01;
# the input ROOT file is the first command-line argument; the tree is "tvec"
rec = root2rec(sys.argv[1], "tvec")

#x = np.extract(np.absolute(rec.Teta) < 1.61, rec.Tbeta)
# One selection shared by both axes: |Teta0| < 1.61, Tlbx_1 set and
# Tpt0 > 10.  It is computed once so x and y cannot drift apart.
selection = np.logical_and(np.logical_and(np.absolute(rec.Teta0) < 1.61, rec.Tlbx_1), rec.Tpt0 > 10)
x = np.extract(selection, rec.Teta0)
y = np.extract(selection, rec.Tphi0)


print("%d %d" % (len(x), len(y)))

# 2-D map of the selected entries with a logarithmic colour scale
pl.hist2d(x, y, bins=200, norm=LogNorm())
#pl.hist2d(x, y, bins=np.arange(0.,4,bin_width) , norm=LogNorm())
pl.colorbar()
#plt.xscale('log')
plt.ylabel(r'$\phi$')
plt.xlabel(r'$\eta$')
plt.draw()
Example #39
0
# Alternative inputs are kept commented out; exactly one ROOT file and one
# CSV file name are active.
#training_files = ['folds/'+f for f in os.listdir('folds') if f.find('train')!=-1]
#training_files = ['/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_jz6_nTrk_v1_1300_1800_mw_merged.root']
training_files = ['/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_nonTrk_v3_400_1200_mw_merged.root']
#training_files = ['/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_jz5_nTrk_v1_800_1200_mw_merged.root']
cols = np.linspace(1,42,42,dtype=int)
#filename = '/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_jz6_nTrk_v1_1300_1800_mw_merged.csv'
filename = '/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_nonTrk_v3_400_1200_mw_merged.csv'
#filename = '/Disk/ds-sopa-group/PPE/atlas/users/tibristo/BosonTagging/csv/AntiKt10LCTopoTrimmedPtFrac5SmallR20_13tev_mc15_jz5_nTrk_v1_800_1200_mw_merged.csv'

for f in training_files:
    # Read the whole tree; its field names are the candidate variables.
    # (An earlier approach went through TFile/GetListOfBranches instead.)
    X = rn.root2rec(f)
    variables = list(X.dtype.names)

    # The label is never a training variable (raises ValueError if absent).
    variables.remove('label')
    # "Observers" are not used for training, or are weights; those must not
    # be scaled, so they are dropped from the variable list as well.
    observers = ['mc_event_weight','jet_antikt10truthtrimmedptfrac5smallr20_pt','jet_antikt10truthtrimmedptfrac5smallr20_eta','m','pt','eta','phi','evt_xsec','evt_filtereff','evt_nevts','weight','jet_camkt12truth_pt','jet_camkt12truth_eta','jet_camkt12truth_phi','jet_camkt12truth_m','jet_camkt12lctopo_pt','jet_camkt12lctopo_eta','jet_camkt12lctopo_phi','jet_camkt12lctopo_m','eff','averageintperxing']
    variables = [name for name in variables if name not in observers]

    # One slot per remaining variable: means start at 0, spreads at 1.
    curr_means = np.zeros(len(variables))
    curr_std = np.ones(len(variables))
    weighted_means = np.zeros(len(variables))
    weighted_std = np.ones(len(variables))
    for j,v in enumerate(variables):
        mean = np.mean(X[v])
Example #40
0
# Quantum efficiency values, one per entry of e_ph (e_ph is defined outside
# this excerpt).
q_eff = [0.69, 0.78, 0.82, 0.85, 0.85, 0.86, 0.85, 0.84, 0.81, 0.75, 0.65, 0.50]
# Convert eV to MeV for quantum efficiency.  The float divisor keeps the
# conversion exact even if e_ph holds integers, which the integer divisor
# would truncate under Python 2.
for i, energy_ev in enumerate(e_ph):
    e_ph[i] = energy_ev / 1e6
# Determine the maximum and minimum energies - this determines the
# interpolation range.
# NOTE(review): this takes the first and last entries, so it presumably
# relies on e_ph being sorted in ascending order - confirm.
interp_min = e_ph[0]
interp_max = e_ph[-1]
# Interpolate to get a function for quantum efficiency in terms of photon
# energy.
q_eff_fn = interpolate.UnivariateSpline(e_ph, q_eff)

# <codecell>

# Get data
data = root2rec(data_file, treename=treename_str)
# Get output data file
# NOTE(review): no matching close() is visible in this excerpt.
f = open(proc_data_file, "w")

# <codecell>

iEvent = 0
# Arrays of processed number of hits and energy deposit
n_hits_proc = []
energy_proc = []
n_hits_proc_uncut = []
energy_proc_uncut = []
# Loop through the energy deposit arrays for each event
for event_energy in data.energy:
    # Number of hits and energy detected in a single event
    n_hits_registered = 0
Example #41
0
def test_specific_branch():
    # Requesting only 'f_float' must give a record array with exactly that
    # one little-endian float32 field.
    expected_dtype = [('f_float', '<f4')]
    rec = rnp.root2rec(load('single1.root'), branches=['f_float'])
    assert_equal(rec.dtype, expected_dtype)
def run_code(offline_jetpT_threshold = 0., gTower_jetET_threshold = 0., seed_ETthresh = 0.):
  '''Scatter-plot offline jet pT against matched trigger jet ET.

  Events are read one at a time with rnp.root2rec.  An event is kept when it
  has at least one offline jet and the first offline jet has
  pT >= offline_jetpT_threshold.  Offline jets are matched to tower jets with
  match_jets; a pair is recorded as [offline pT, tower E/cosh(eta)] when the
  offline pT exceeds the threshold and the tower jet has E > 0.

  Args:
    offline_jetpT_threshold: pT cut applied to the first offline jet of an
      event and to every matched offline jet.
    gTower_jetET_threshold: only used in the plot title and output file names.
    seed_ETthresh: ET threshold handed to gTowers.SeedFilter.

  Side effects:
    Writes 'events_all_jet_pairs_*' and 'events_leading_offline_jet_pairs_*'
    as '.pkl' and '.png' files to the current directory.

  Relies on names defined outside this function: total_num_events, filename,
  directory, tree, gTowers, OfflineJets, match_jets, rnp, np, pl, pickle.
  '''
  def save_plot(stem, payload):
    # Persist the plotted numbers (.pkl) next to the rendered figure (.png).
    # The pickle goes through a context manager in binary mode so the handle
    # is always closed and the stream is valid on Python 2 and 3.
    with open('%s_%s.pkl' % (stem, filename_ending), 'wb') as handle:
      pickle.dump(payload, handle)
    pl.savefig('%s_%s.png' % (stem, filename_ending))
    pl.close()

  #set seed cuts
  seed_filter = gTowers.SeedFilter(ETthresh = seed_ETthresh, numSeeds = 1.0e5)

  #column names to pull from the file, must be in this order to sync with the predefined classes in atlas_jets package
  offline_column_names = ['jet_AntiKt10LCTopo_%s' % col for col in ['E', 'pt', 'm', 'eta', 'phi']]
  gTower_column_names = ['gTower%s' % col for col in ['E', 'NCells', 'EtaMin', 'EtaMax', 'PhiMin', 'PhiMax']]

  # one (n_pairs, 2) array per event that produced at least one pair
  matched_jet_pairs = []

  num_offlineEvents = 0

  # main loop that goes over the file
  for event_num in range(total_num_events):
    if event_num % 100 == 0 and event_num != 0:
      print("doing event_num=%d for (%d, %d, %d)" % (event_num, offline_jetpT_threshold, gTower_jetET_threshold, seed_ETthresh))
    # pull in data row by row
    data = rnp.root2rec(filename, treename='%s/%s' % (directory,tree), branches=offline_column_names + gTower_column_names, start=(event_num), stop=(event_num+1))
    oEvent = OfflineJets.Event(event=[data[col][0] for col in offline_column_names])

    # if there are no offline jets, we skip it
    if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
      continue
    num_offlineEvents += 1

    # can use seed_filter on an event by event basis
    # max number of seeds based on number of offline jets
    #seed_filter = gTowers.SeedFilter(numSeeds = len(oEvent.jets))
    tEvent = gTowers.TowerEvent(event=[data[col][0] for col in gTower_column_names], seed_filter = seed_filter)
    tEvent.get_event()

    #paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.filter_towers())
    paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.event.jets)
    event_pairs = np.array([[oJet.pT, tJet.E/np.cosh(tJet.eta)] for oJet,tJet in paired_jets if oJet.pT > offline_jetpT_threshold and tJet.E > 0.])
    # events without a surviving pair contribute nothing to either plot
    if event_pairs.size:
      matched_jet_pairs.append(event_pairs)
  # at this point, we've processed all the data and we just need to make plots

  filename_ending = 'offline%d_gTower%d_seed%d_unweighted' % (offline_jetpT_threshold, gTower_jetET_threshold, seed_filter.ETthresh)

  # The per-event arrays have different lengths, so they are stacked directly
  # instead of first being wrapped in one ragged np.array (which recent NumPy
  # versions reject).  With no pairs at all, empty (0, 2) arrays keep the
  # column indexing below valid.
  if matched_jet_pairs:
    all_jet_pairs = np.vstack(matched_jet_pairs)
    # per event, keep the row(s) whose trigger ET is the largest in that event
    leading_offline_jet_pairs = np.vstack([pairs[pairs[:,1] == pairs[:,1].max()] for pairs in matched_jet_pairs])
  else:
    all_jet_pairs = np.empty((0, 2))
    leading_offline_jet_pairs = np.empty((0, 2))

  xlim = (1e2,5e2)
  ylim = (0.,1200.)

  # both figures carry the same title; raw strings keep the LaTeX backslashes
  # literal without relying on invalid escape sequences
  title = r'$p_T^{\mathrm{offline jet}}$ > %d GeV, %d events, $E_T^{\mathrm{tower jet}}$ > %d GeV, $E_T^{\mathrm{seed}}$ > %d GeV' % (offline_jetpT_threshold, num_offlineEvents, gTower_jetET_threshold, seed_filter.ETthresh)

  #make figures
  # All Jet Pairs
  pl.figure()
  pl.xlabel(r'offline $p_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel(r'trigger $E_T^{\mathrm{jet}}$ [GeV]')
  pl.title(title)
  pl.scatter(all_jet_pairs[:,0], all_jet_pairs[:,1])
  pl.grid(True, which='both')
  pl.xlim(xlim)
  pl.ylim(ylim)
  pl_aJet = {'xdata': all_jet_pairs[:,0],
             'ydata': all_jet_pairs[:,1]}
  save_plot('events_all_jet_pairs', pl_aJet)

  # Leading Offline Jet Pairs
  pl.figure()
  pl.xlabel(r'leading offline $p_T^{\mathrm{jet}}$ [GeV]')
  pl.ylabel(r'trigger $E_T^{\mathrm{jet}}$ [GeV]')
  pl.title(title)
  pl.scatter(leading_offline_jet_pairs[:,0], leading_offline_jet_pairs[:,1])
  pl.grid(True, which='both')
  pl.xlim(xlim)
  pl.ylim(ylim)
  pl_lJet = {'xdata': leading_offline_jet_pairs[:,0],
             'ydata': leading_offline_jet_pairs[:,1]}
  save_plot('events_leading_offline_jet_pairs', pl_lJet)
Example #43
0
def loaddata(filename):
    """Return the result of root2rec for the ROOT file `filename`."""
    return root2rec(filename)
# Quantum efficiency values, one per entry of e_ph (e_ph is defined outside
# this excerpt).
q_eff = [0.69, 0.78, 0.82, 0.85, 0.85, 0.86, 0.85, 0.84, 0.81, 0.75, 0.65, 0.50]
# Convert eV to MeV.  The float divisor keeps the conversion exact even if
# e_ph holds integers, which the integer divisor would truncate under
# Python 2.
for i, energy_ev in enumerate(e_ph):
    e_ph[i] = energy_ev / 1e6
# Determine the maximum and minimum energies - this determines the
# interpolation range.
# NOTE(review): this takes the first and last entries, so it presumably
# relies on e_ph being sorted in ascending order - confirm.
interp_min = e_ph[0]
interp_max = e_ph[-1]
# Interpolate to get a function for quantum efficiency in terms of photon
# energy.
q_eff_fn = interpolate.UnivariateSpline(e_ph, q_eff)

# <codecell>

# Get data
data = root2rec("/home/pythontutorial/mountpoint/output/length_study/lengthStudy1000.root", treename="ntp1")
print("got data")

# Get output data file
# NOTE(review): no matching close() is visible in this excerpt.
f = open("processed.dat", "w")

# <codecell>

# Cuts for the plotting of the data
n_hits_plot_cut_low = 0
n_hits_plot_cut_high = 500
# Number of events processed
iEvent_proc = 0
# Arrays of processed number of hits and energy deposit
n_hits_proc = []
energy_proc = []
# Minimise x2reg over a, b and c, starting from the given initial values, then
# display the result.  x2reg is defined earlier, outside this excerpt.
m = Minuit(x2reg, a=1, b=2, c=3)
m.migrad()
x2reg.show(m)

# <markdowncell>

# ###Let's do some physics
# Remember the D mass? Let's try to fit a relativistic Breit-Wigner to it.

# <codecell>

from root_numpy import root2rec

# <codecell>

# Three samples selected by file-name pattern: every file, the B* files and
# the cc* files.
data = root2rec('data/*.root')
bb = root2rec('data/B*.root')
cc = root2rec('data/cc*.root')

# <codecell>

# Each DMass column is passed through np.hstack before plotting
# (presumably it holds one array per entry - confirm); the three
# distributions are overlaid as step histograms.
hs = np.hstack
hist([hs(data.DMass), hs(bb.DMass), hs(cc.DMass)], bins=50, histtype='step');

# <markdowncell>

# ###Simple fit
# First let's fit bb's DMass alone with a Breit-Wigner.

# <codecell>
Example #46
0
  bins_towerHistogram    = np.array([0,50,100,150,200,250,300,350,400,500,750,1000,4000]).astype(float)
  bins_efficiency        = np.arange(0,1240, 20).astype(float)

  hist_towerMultiplicity = np.zeros(len(bins_towerMultiplicity)-1).astype(float)
  hist_towerHistogram    = np.zeros(len(bins_towerHistogram)-1).astype(float)
  hist_efficiency_num    = np.zeros(len(bins_efficiency)-1).astype(float)
  hist_efficiency_den    = np.zeros(len(bins_efficiency)-1).astype(float)

  num_offlineEvents = 0

  # main loop that goes over the file
  for event_num in range(total_num_events):
    if event_num % 100 == 0:
      print "doing event_num=%d for (%d, %d, %d)" % (event_num, offline_jetpT_threshold, gTower_jetET_threshold, seed_ETthresh)
    # pull in data row by row
    data = rnp.root2rec(filename, treename='%s/%s' % (directory,tree), branches=offline_column_names + gTower_column_names, start=(event_num), stop=(event_num+1))
    oEvent = OfflineJets.Event(event=[data[col][0] for col in offline_column_names])

    # if there are no offline jets, we skip it
    if len(oEvent.jets) == 0 or oEvent.jets[0].pT < offline_jetpT_threshold:
      continue
    num_offlineEvents += 1

    '''can use seed_filter on an event by event basis'''
    # max number of seeds based on number of offline jets
    #seed_filter = gTowers.SeedFilter(numSeeds = len(oEvent.jets))
    tEvent = gTowers.TowerEvent(event=[data[col][0] for col in gTower_column_names], seed_filter = seed_filter)

    #tEvent.get_event()
    paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.filter_towers())
    #paired_jets = match_jets(oJets=oEvent.jets, tJets=tEvent.event.jets)
Example #47
0
# <codecell>

# Print the loop counter three times: "loop 0", "loop 1", "loop 2".
for i in range(3):
    print('loop %s' % i)

# <markdowncell>

# ###Reading ROOT FILE

# <codecell>

from root_numpy import root2rec

# <codecell>

bb = root2rec('data/B*.root')  #yep that simple
cc = root2rec('data/cc-BtoDpi-all.root')

# <markdowncell>

# ###And plotting

# <codecell>

# Overlay the R2All distributions of both samples as half-transparent filled
# histograms, then make the legend box half-transparent too.
hist(
    [bb.R2All, cc.R2All],
    bins=100,
    histtype='stepfilled',
    color=['red', 'green'],
    alpha=0.5,
    label=[r'$B\bar{B}$', r'$c\bar{c}$'],
)
legend_frame = legend().get_frame()
legend_frame.set_alpha(0.5)
# Convert eV to MeV.  The float divisor keeps the conversion exact even if
# e_ph holds integers, which the integer divisor would truncate under
# Python 2.  (e_ph and q_eff are defined outside this excerpt.)
for i, energy_ev in enumerate(e_ph):
    e_ph[i] = energy_ev / 1e6
# Determine the maximum and minimum energies - this determines the
# interpolation range.
# NOTE(review): this takes the first and last entries, so it presumably
# relies on e_ph being sorted in ascending order - confirm.
interp_min = e_ph[0]
interp_max = e_ph[-1]
# Interpolate to get a function for quantum efficiency in terms of photon
# energy.
q_eff_fn = interpolate.UnivariateSpline(e_ph, q_eff)

# <codecell>

# Get data
data = root2rec(
    "/home/pythontutorial/mountpoint/output/length_study/lengthStudy1000.root",
    treename="ntp1")
print("got data")

# Get output data file
# NOTE(review): no matching close() is visible in this excerpt.
f = open("processed.dat", "w")

# <codecell>

# Cuts for the plotting of the data
n_hits_plot_cut_low = 0
n_hits_plot_cut_high = 500
# Number of events processed
iEvent_proc = 0
# Arrays of processed number of hits and energy deposit
n_hits_proc = []
Example #49
0
def test_expression():
    # A read whose only branch is the formula 'f_float*2' must match the
    # plain 'f_float' column multiplied by two.
    plain = rnp.root2rec(load('single*.root'))
    doubled = rnp.root2rec(load('single*.root'), branches=['f_float*2'])
    expected = plain['f_float'] * 2
    assert_array_equal(expected, doubled['f_float*2'])
Example #50
0
# ### ... and continue below the "Save dataframes" section

# <markdowncell>

# # Load data from ROOT trees

# <codecell>

# tree data information for import
branches = ['mcnpart', 'mcid', 'mcm', 'mcp', 'mctheta', 'mcphi']
rootfiles = ['genev000.root', 'genev100.root', 'genev200.root']
# data set names corresponding to each of rootfiles
dfnames = ['000', '100', '200']
# import tree data as 3 numpy structured records
# (only the first 10000 entries of each file are kept)
treerecs = [root2rec('../testdata/%s' % fn, branches=branches)[0:10000] for fn in rootfiles]
# associate data sets with dfnames
dfdict = dict(zip(dfnames, [pd.DataFrame(treerec) for treerec in treerecs]))
# create single data frame with multi-index
# NOTE(review): pd.Panel was deprecated and later removed from pandas, so this
# line only runs on old pandas versions; a pd.concat-based replacement should
# be checked against this output before switching.
df = pd.Panel.from_dict(dfdict, orient='minor').swapaxes().to_frame()

# <markdowncell>

# # Add lab-frame 4-momentum components

# <codecell>

# d2r is defined outside this excerpt (presumably a degrees-to-radians
# factor - confirm).  The scaled angles feed the cosine and sine tables.
dfrad = df[['mctheta','mcphi']]*d2r
dfcos = dfrad.applymap(np.cos)
dfsin = dfrad.applymap(np.sin)
# z component: momentum magnitude times cos(theta)
df['pz'] = df.mcp*dfcos.mctheta
Example #51
0
def main():
    """Walk the TPC ntuple directory and submit one refitter job per file.

    Every file found below the input directory is mapped to an output path
    ``<NTP>/[TPC3/|TPC4/]<date dir>/<name>_skim.root``, where the TPC
    sub-directory is chosen from the file-name prefix before the first '_'
    and the date directory is the last component of the file's directory
    (forced to '2016-05-10' for directories containing a 'TPC4' component).
    Files whose directory path contains a component named 'skims',
    'badtime', 'old', 'ENV', 'tmp' or 'ToRemove' are skipped.

    An already existing output file is deleted, then the job is submitted
    with ``bsub``: to queue 'l' when the input 'tree' has more than 60000
    entries, otherwise to queue 's'.

    If no file was submitted at all, ``job_check.main()`` is run instead
    (``job_check`` is imported after adding 'py' to ``sys.path``).
    """
    # Local imports: only this function needs them.
    import shlex
    import subprocess

    ### Paths for KEKCC
    ifpath = '/ghi/fs01/belle2/bdata/group/detector/BEAST/data/NTP/TPC/'
    #ofpath = '/ghi/fs01/belle2/bdata/group/detector/BEAST/data/NTP/TPC/skims/indiv_skims/'

    # directory components that exclude a file from processing
    excluded_dirs = ('badtime', 'old', 'ENV', 'tmp', 'ToRemove')

    ### Debug variables
    counter = 0

    for subdir, dirs, files in os.walk(ifpath):
        for f in files:
            ofpath = '/ghi/fs01/belle2/bdata/group/detector/BEAST/data/NTP/'

            test = subdir.split('/')

            #if 'TPC3' in test or 'TPC4' in test or 'skims' in test:
            #    continue

            if 'skims' in test:
                continue

            tpc_num = f.split('_')[0]
            date_dir = test[-1]
            if 'TPC4' in test:
                date_dir = '2016-05-10'
                print('Date dir is:', date_dir)
            #print('Date dir is:', date_dir)
            print('Directory is:', subdir)

            if tpc_num == 'tpc3':
                ofpath += 'TPC3/'
            elif tpc_num == 'tpc4':
                ofpath += 'TPC4/'

            ofpath += str(date_dir) + '/'

            if any(name in test for name in excluded_dirs):
                continue

            ifile = os.path.join(subdir, f)

            # output name: everything before the first '.' plus '_skim.root'
            ofile = ofpath + f.split('.')[0] + '_skim.root'

            ### Uncomment this line if only non-existing files are to be generated
            #if os.path.isfile(ofile): continue

            counter += 1

            ### Uncomment these lines if all files must be regenerated
            #input('Warning! You are about to delete all existing files!')
            if os.path.isfile(ofile):
                # Remove the stale output directly instead of through a
                # shell 'rm'.  A failure is reported but, as before, does
                # not stop the submission.
                try:
                    os.remove(ofile)
                except OSError as err:
                    print('Could not remove %s: %s' % (ofile, err))

            print('Infile is:', ifile)
            print('Outfile is:', ofile)

            log = 'logs/' + str(f) + '.log'

            #os.system('bsub -q s -o %s "./refitter %s %s"' % (log, ifile, ofile))
            ### Send large files to long queue, small files to short queue
            events = root2rec(ifile, 'tree', branches='m_event')
            queue = 'l' if len(events) > 60000 else 's'

            # The arguments are passed as a list so no local shell parses
            # the paths; the refitter command itself stays one string for
            # bsub, with both paths quoted.
            job = './refitter %s %s' % (shlex.quote(ifile), shlex.quote(ofile))
            subprocess.call(['bsub', '-q', queue, '-o', log, job])

    if counter == 0:
        sys.path.append('py')
        import job_check
        job_check.main()
Example #52
0
def test_expression():
    # A read whose only branch is the formula 'f_float*2' must match the
    # plain 'f_float' column multiplied by two.
    plain = rnp.root2rec(load('single*.root'))
    doubled = rnp.root2rec(load('single*.root'), branches=['f_float*2'])
    expected = plain['f_float'] * 2
    assert_array_equal(expected, doubled['f_float*2'])
Example #53
0
def test_vector():
    """Check dtypes and sample values of the vector branches in vector.root.

    The ``v_*`` branches hold one array per entry and the ``vv_*`` branches
    one array of arrays per entry; both are stored as object columns.

    The removed NumPy aliases ``np.bool`` and ``np.object`` are replaced by
    ``np.bool_`` and ``np.object_``, which compare equal to the same dtypes.
    """
    a = rnp.root2rec(load('vector.root'))

    # (branch suffix, element dtype), in the order the branches appear
    element_types = [
        ('i', np.int32),
        ('f', np.float32),
        ('F', np.float32),
        ('d', np.float64),
        ('l', np.int64),
        ('c', np.int8),
        ('b', np.bool_),
    ]

    # every column is an object column: all v_* first, then all vv_*
    types = [('%s_%s' % (prefix, suffix), 'O')
             for prefix in ('v', 'vv')
             for suffix, _ in element_types]
    assert_equal(a.dtype, types)

    for suffix, element_type in element_types:
        flat = getattr(a, 'v_' + suffix)
        nested = getattr(a, 'vv_' + suffix)
        assert_equal(flat[0].dtype, element_type)
        # assert that wrapper array is np.object_
        assert_equal(nested[0].dtype, np.object_)
        assert_equal(nested[0][0].dtype, element_type)

    # check a few values: [0][0], [1][1], [-2][0] and [-2][-1]
    value_checks = [
        ('v_i', (1, 3, 9, 17)),
        ('v_f', (2.0, 5.0, 18.0, 26.0)),
        ('v_F', (2.0, 5.0, 18.0, 26.0)),
    ]
    for name, (first, second, third, last) in value_checks:
        column = getattr(a, name)
        assert_equal(column[0][0], first)
        assert_equal(column[1][1], second)
        assert_equal(column[-2][0], third)
        assert_equal(column[-2][-1], last)

    # more strict conditioning for numpy arrays
    def assert_equal_array(arr1, arr2):
        # np.array_equal also reports a shape mismatch as "not equal" instead
        # of depending on how `==` behaves for arrays of different shapes
        return assert_equal(np.array_equal(arr1, arr2), True,
            "array mismatch: {0} != {1}".format(arr1, arr2))

    # same four positions for the nested branches
    nested_checks = [
        ('vv_i', np.int32,
         ([1], [2, 3], [9], [9, 10, 11, 12, 13, 14, 15, 16, 17])),
        ('vv_f', np.float32,
         ([2.], [4., 5.], [18.],
          [18., 19., 20., 21., 22., 23., 24., 25., 26.])),
        ('vv_F', np.float32,
         ([2.], [4., 5.], [18.],
          [18., 19., 20., 21., 22., 23., 24., 25., 26.])),
    ]
    for name, element_type, (first, second, third, last) in nested_checks:
        column = getattr(a, name)
        assert_equal_array(column[0][0], np.array(first, dtype=element_type))
        assert_equal_array(column[1][1], np.array(second, dtype=element_type))
        assert_equal_array(column[-2][0], np.array(third, dtype=element_type))
        assert_equal_array(column[-2][-1], np.array(last, dtype=element_type))
# Minimise x2reg over a, b and c, starting from the given initial values, then
# display the result.  x2reg is defined earlier, outside this excerpt.
m = Minuit(x2reg, a=1, b=2, c=3)
m.migrad()
x2reg.show(m)

# <markdowncell>

# ###Let's do some physics
# Remember the D mass? Let's try to fit a relativistic Breit-Wigner to it.

# <codecell>

from root_numpy import root2rec

# <codecell>

# Three samples selected by file-name pattern: every file, the B* files and
# the cc* files.
data = root2rec('data/*.root')
bb = root2rec('data/B*.root')
cc = root2rec('data/cc*.root')

# <codecell>

# Each DMass column is passed through np.hstack before plotting
# (presumably it holds one array per entry - confirm); the three
# distributions are overlaid as step histograms.
hs = np.hstack
hist([hs(data.DMass), hs(bb.DMass), hs(cc.DMass)], bins=50, histtype='step');

# <markdowncell>

# ###Simple fit
# First let's fit bb's DMass alone with a Breit-Wigner.

# <codecell>
Example #55
0
def get_initial_DataFrame(inFile, TTree_name_arr, eta_bins, pt_bins, pid_dict,
                          classes_str):
    """
    Load the jet data set as a pandas DataFrame.

    If ``inFile`` is given, the DataFrame is built from the ROOT input file,
    the selection cuts are applied and the result is stored as HDF5 under
    ``inputFiles/`` so that a later run with the same cuts (but e.g. a
    different reweighting procedure) can skip the conversion. If ``inFile``
    is empty, a previously stored HDF5 file is read instead.

    Args:
        inFile: path to the ROOT input file, or a falsy value to read the
            cached HDF5 file. The cache name is taken from the second
            '/'-separated component of this path.
        TTree_name_arr: names of the trees to read from ``inFile``; their
            contents are concatenated row-wise. Must not be empty.
        eta_bins: bin edges in |eta|; only the last edge is used (upper cut).
        pt_bins: bin edges in pT; the first edge is the lower pT cut and the
            last edge goes into the cache file name.
        pid_dict: maps flavour names ("b", "c", "u" and optionally "tau") to
            the label values to keep.
        classes_str: tag inserted into the cache file name.

    Returns:
        ``(df, file_info_str)`` on success, or ``False`` when no ROOT file
        was given and the cached HDF5 file does not exist.
    """
    import os
    from btag_nn_inputs import jet_eta_str, jet_pt_str, default_sample_info

    if inFile:
        from numpy.lib.recfunctions import stack_arrays
        from root_numpy import root2rec

        print('Convert ROOT file to pandas DataFrame...')
        # Build one frame per tree and concatenate once; appending inside a
        # loop copies the accumulated frame on every iteration.
        frames = [
            pd.DataFrame(stack_arrays([root2rec(inFile, tree_name)]))
            for tree_name in TTree_name_arr
        ]
        df = pd.concat(frames, ignore_index=True)
        print('conversion complete')

        # only interested in absolute values of eta and the label, so this
        # will speed the calculations up:
        df.update(df[jet_eta_str].abs(), join='left', overwrite=True)
        df.update(df['label'].abs(), join='left', overwrite=True)

        # dataset selection: limited to b-, c- and light jets (plus taus when
        # the caller lists them in pid_dict)
        flavours = ["b", "c", "u"]
        if "tau" in pid_dict:
            flavours.append("tau")
        keep = df['label'] == pid_dict.get(flavours[0])
        for flavour in flavours[1:]:
            keep = keep | (df['label'] == pid_dict.get(flavour))
        df = df[keep]

        # jet min-pT and max-abs-eta cut (pT / eta acceptance region)
        df = df[(df[jet_pt_str] > pt_bins[0]) &
                (df[jet_eta_str] < eta_bins[-1])]

        # pile-up removal (use this when working in GeV): the JVT cut is only
        # required for jets with |eta| < 2.4 and pT < 60
        df = df[((df['JVT'] > 0.59) & (df[jet_eta_str] < 2.4) &
                 (df[jet_pt_str] < 60.)) |
                (df[jet_pt_str] >= 60.) |
                (df[jet_eta_str] >= 2.4)]

        # store as HDF5 file to speed up the progress for next iteration.
        # Floor division keeps the pT tag an integer on Python 2 and 3 alike.
        file_info_str = (inFile.split('/')[1].replace('.root', '') + '_' +
                         classes_str + 'jets_pTmax' +
                         str(int(pt_bins[-1]) // 1000) + 'GeV')
        df.to_hdf('inputFiles/' + file_info_str + '.h5', 'df')
        print('saved input data in HDF5 format for next run.')
        return df, file_info_str

    # No ROOT file given: fall back to the cached HDF5 file.
    # NOTE(review): this name uses the raw last pT edge without the '/1000'
    # and 'GeV' suffix used when saving above -- confirm this is intended.
    file_info_str = (default_sample_info + '_' + classes_str + 'jets_pTmax' +
                     str(int(pt_bins[-1])))
    h5_path = 'inputFiles/' + file_info_str + '.h5'
    try:
        if not os.path.isfile(h5_path):
            print("File does not exist. Try running the path to the ROOT file as additional argument.")
            return False
    except IOError as ex:
        # The handler used to format an undefined name and would itself
        # have raised NameError.
        print('({})'.format(ex))
    return pd.read_hdf(h5_path, 'df'), file_info_str
Example #56
0
def test_specific_branch():
    """Requesting only ``f_float`` must yield a record with just that field."""
    expected_dtype = [('f_float', '<f4')]
    rec = rnp.root2rec(load('single1.root'), branches=['f_float'])
    assert_equal(rec.dtype, expected_dtype)
Example #57
0
def load_data(ipath):
    ''' Loads the dataset

    Every entry of the directory ``ipath`` is read with ``root2rec``; selected
    events are turned into flattened 336x80 binary hit maps with an integer
    class label (0: alpha, 1: x-ray, 2: neutron).

    :type ipath: string
    :param ipath: directory containing the ROOT files to load

    :return: ``[(train_x, train_y), (valid_x, valid_y), (test_x, test_y)]``
             where each ``*_x`` is a Theano shared variable and each ``*_y``
             is the shared label vector cast to ``int32``
    '''

    #############
    # LOAD DATA #
    #############

    print('... loading data')

    n_rows, n_cols = 336, 80
    prescale = 50  # alpha / x-ray events are only kept when their counter hits a multiple of this

    alpha_counter = 1
    xray_counter = 1
    data_x = []
    data_y = []

    # Sorted so that the train/valid/test split below is reproducible;
    # os.listdir returns entries in arbitrary order.
    for fname in sorted(os.listdir(ipath)):
        # os.path.join also works when ipath has no trailing separator.
        events = root2rec(os.path.join(str(ipath), str(fname)))

        for event in events:
            e_type = 3  # 3 means "do not keep this event"

            if event.proton == 1 or (event.other == 1 and event.xray == 0):
                continue
            if event.bottom_alpha == 1 or event.top_alpha == 1:
                alpha_counter += 1
            if alpha_counter % prescale == 0:
                e_type = 0
                alpha_counter = 1
            if event.xray == 1:
                xray_counter += 1
            if xray_counter % prescale == 0:
                e_type = 1
                xray_counter = 1
            # Neutrons are never prescaled and override the labels above.
            if event.neutron == 1:
                e_type = 2
            if e_type >= 3:
                continue

            data_y.append(e_type)

            # Binary hit map; row/col are stored 1-based in the input.
            # (A ToT-weighted fill using event.tot was left disabled here.)
            pix = numpy.zeros([n_rows, n_cols])
            cols = event.col
            rows = event.row
            for i in range(event.npoints):
                pix[rows[i] - 1][cols[i] - 1] = 1
            data_x.append(numpy.reshape(pix, n_rows * n_cols))

    data_x = numpy.asarray(data_x, dtype=numpy.float32)
    data_y = numpy.asarray(data_y, dtype=numpy.float32)

    # 80% training; the rest is shared between validation and test. The
    # boundaries are absolute indices so that no sample falls between the
    # validation and test slices and an empty test share stays empty
    # (a negative-index slice ``[-0:]`` would select everything).
    n_total = len(data_x)
    n_train = int(n_total * 0.8)
    n_test = int((n_total - n_train) / 2)
    test_start = n_total - n_test

    train_set = [data_x[:n_train], data_y[:n_train]]
    valid_set = [data_x[n_train:test_start], data_y[n_train:test_start]]
    test_set = [data_x[test_start:], data_y[test_start:]]

    # train_set, valid_set, test_set format: [input, target]
    # input is a numpy.ndarray of 2 dimensions (a matrix)
    # where each row corresponds to an example. target is a
    # numpy.ndarray of 1 dimension (vector) that has the same length as
    # the number of rows in the input. It should give the target
    # to the example with the same index in the input.

    def shared_dataset(data_xy, borrow=True):
        """ Function that loads the dataset into shared variables

        The reason we store our dataset in shared variables is to allow
        Theano to copy it into the GPU memory (when code is run on GPU).
        Since copying data into the GPU is slow, copying a minibatch everytime
        is needed (the default behaviour if the data is not in a shared
        variable) would lead to a large decrease in performance.
        """
        inputs, targets = data_xy
        shared_x = theano.shared(numpy.asarray(inputs,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        shared_y = theano.shared(numpy.asarray(targets,
                                               dtype=theano.config.floatX),
                                 borrow=borrow)
        # When storing data on the GPU it has to be stored as floats
        # therefore we will store the labels as ``floatX`` as well
        # (``shared_y`` does exactly that). But during our computations
        # we need them as ints (we use labels as index, and if they are
        # floats it doesn't make sense) therefore instead of returning
        # ``shared_y`` we will have to cast it to int. This little hack
        # lets us get around this issue
        return shared_x, T.cast(shared_y, 'int32')

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y),
            (test_set_x, test_set_y)]
    return rval
Example #58
0
def test_vector():
    """Check dtypes and sample values of the vector branches in vector.root.

    ``v_*`` branches are read as one array per entry; ``vv_*`` branches as an
    object array of arrays per entry. The builtin ``bool`` / ``object`` are
    used for dtype comparisons because the ``np.bool`` / ``np.object``
    aliases were removed from recent NumPy releases.
    """
    a = rnp.root2rec(load('vector.root'))

    # (branch suffix, expected element dtype), in branch order
    element_dtypes = [
        ('i', np.int32),
        ('f', np.float32),
        ('F', np.float32),
        ('d', np.float64),
        ('l', np.int64),
        ('c', np.int8),
        ('b', bool),
    ]
    suffixes = [suffix for suffix, _ in element_dtypes]

    # every vector branch is stored as an object column
    types = ([('v_' + suffix, 'O') for suffix in suffixes] +
             [('vv_' + suffix, 'O') for suffix in suffixes])
    assert_equal(a.dtype, types)

    for suffix, dtype in element_dtypes:
        assert_equal(getattr(a, 'v_' + suffix)[0].dtype, dtype)

    # assert that wrapper array is of object dtype
    for suffix in suffixes:
        assert_equal(getattr(a, 'vv_' + suffix)[0].dtype, object)

    for suffix, dtype in element_dtypes:
        assert_equal(getattr(a, 'vv_' + suffix)[0][0].dtype, dtype)

    # check a few values: [0][0], [1][1], [-2][0], [-2][-1]
    scalar_checks = [
        ('v_i', (1, 3, 9, 17)),
        ('v_f', (2.0, 5.0, 18.0, 26.0)),
        ('v_F', (2.0, 5.0, 18.0, 26.0)),
    ]
    for name, (first, second, third, last) in scalar_checks:
        column = getattr(a, name)
        assert_equal(column[0][0], first)
        assert_equal(column[1][1], second)
        assert_equal(column[-2][0], third)
        assert_equal(column[-2][-1], last)

    # more strict conditioning for numpy arrays; array_equal also reports a
    # shape mismatch as a plain failure instead of erroring out
    def assert_equal_array(arr1, arr2):
        return assert_equal(np.array_equal(arr1, arr2), True,
            "array mismatch: {0} != {1}".format(arr1, arr2))

    nested_checks = [
        ('vv_i', np.int32, ([1], [2, 3], [9],
                            [9, 10, 11, 12, 13, 14, 15, 16, 17])),
        ('vv_f', np.float32, ([2.], [4., 5.], [18.],
                              [18., 19., 20., 21., 22.,
                               23., 24., 25., 26.])),
        ('vv_F', np.float32, ([2.], [4., 5.], [18.],
                              [18., 19., 20., 21., 22.,
                               23., 24., 25., 26.])),
    ]
    for name, dtype, (first, second, third, last) in nested_checks:
        column = getattr(a, name)
        assert_equal_array(column[0][0], np.array(first, dtype=dtype))
        assert_equal_array(column[1][1], np.array(second, dtype=dtype))
        assert_equal_array(column[-2][0], np.array(third, dtype=dtype))
        assert_equal_array(column[-2][-1], np.array(last, dtype=dtype))
Example #59
0
def getvar(var, filename):
    """Return the values of branch ``var`` read from the 'tree' tree of ``filename``."""
    # Only the requested branch is read from the file.
    record = root2rec(filename, 'tree', [var])
    return record[var]
Example #60
0
 def load(self):
     """Read ``<directory>/<tree>`` from ``self.filename`` into ``self.data`` and report it."""
     tree_path = '%s/%s' % (self.directory, self.tree)
     self.data = rnp.root2rec(self.filename, tree_path)
     # Only the last path component of the file name is shown in the message.
     short_name = self.filename.rsplit(os.sep, 1)[-1]
     print("Loaded %s:%s/%s" % (short_name, self.directory, self.tree))