Example #1
def saveTree(processPath, dictVar, vector, MVAVector=None, SF=1, nameTree="reducedTree"):
    import numpy as np
    from root_numpy import array2root

    i = 0
    for key in dictVar.keys():
        if i == 0:
            writeMode = 'recreate'
            i = 1
        else:
            writeMode = 'update'

        v = np.asarray(vector[:, dictVar[key]])
        name = key

        if key == 'diphotonCandidate.M()':
            name = 'Mgg'
        elif key == 'dijetCandidate.M()':
            name = 'Mjj'
        #elif key == 'HHTagger2017':
        #    name = 'MVAOutput'

        if SF != 1 and key == 'weight':
            v = np.multiply(v, SF)

        # strip characters that ROOT branch names cannot contain
        v.dtype = [(name.replace(".", "").replace("(", "").replace(")", "").replace("/", "_Over_").replace("_", "").replace("Candidate", ""), np.float64)]

        array2root(v, processPath, nameTree, mode=writeMode)

    if MVAVector is not None:
        v = np.asarray(MVAVector.ravel())
        v.dtype = [('MVAOutput', np.float64)]
        array2root(v, processPath, nameTree, mode='update')
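A minimal usage sketch for the function above, assuming root_numpy is installed; the column map, data matrix, and output path are hypothetical:

import numpy as np

dictVar = {'diphotonCandidate.M()': 0, 'weight': 1}  # hypothetical column map
vector = np.random.rand(100, 2)                      # hypothetical event matrix
saveTree('signal.root', dictVar, vector, SF=0.5)     # writes branches 'Mgg' and 'weight'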
Example #2
def test_array2tree_fixed_length_arrays():
    f = load(['fixed1.root', 'fixed2.root'])
    a = rnp.root2array(f)
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
Example #3
def write_prediction_to_file(features,
                             scaler,
                             model,
                             filename='',
                             treename='',
                             branch=''):
    data_out = features
    features = features[:, 0:NDIM]
    features = scaler.transform(features)
    y_predict_all = model.predict(features)  # normal numpy array
    data_out = numpy.concatenate((data_out, y_predict_all), axis=1)
    dtype = numpy.dtype([('layer', numpy.float32), ('n1', numpy.float32),
                         ('n2', numpy.float32), ('n3', numpy.float32),
                         ('n4', numpy.float32), ('n5', numpy.float32),
                         ('n6', numpy.float32), ('un1', numpy.float32),
                         ('un2', numpy.float32), ('un3', numpy.float32),
                         ('un4', numpy.float32), ('un5', numpy.float32),
                         ('un6', numpy.float32), ('dn1', numpy.float32),
                         ('dn2', numpy.float32), ('dn3', numpy.float32),
                         ('dn4', numpy.float32), ('dn5', numpy.float32),
                         ('dn6', numpy.float32), ('nup', numpy.float32),
                         ('ndown', numpy.float32), ('event', numpy.float32),
                         ('rechitsum', numpy.float32),
                         ('rechit', numpy.float32),
                         ('rechit_dnn', numpy.float32)])

    data_out = numpy.core.records.fromarrays(
        data_out.transpose(), dtype=dtype)  # structured numpy array
    root_numpy.array2root(data_out,
                          filename,
                          treename=treename,
                          mode='recreate')
Example #4
    def writeOutPrediction(self, predicted, features, truth, weights,
                           outfilename, inputfile):
        # predicted will be a list
        spectator_branches = ['jet_pt', 'jet_eta']
        from root_numpy import array2root
        if inputfile[-5:] == 'djctd':
            print(
                "storing normed pt and eta... run on root files if you want something else for now"
            )
            spectators = features[0][:, 0:2].transpose()
        else:
            import uproot3 as uproot
            print(inputfile)
            urfile = uproot.open(inputfile)["deepntuplizer/tree"]
            spectator_arrays = urfile.arrays(spectator_branches)
            print(spectator_arrays)
            spectators = [
                spectator_arrays[a.encode()] for a in spectator_branches
            ]

        out = np.core.records.fromarrays(
            np.vstack(
                (predicted[0].transpose(), truth[0].transpose(), spectators)),
            names=
            'prob_isB, prob_isBB, prob_isC,prob_isUDSG,isB, isBB, isC,isUDSG,jet_pt, jet_eta'
        )
        array2root(out, outfilename, 'tree')
Example #5
    def _predict(args):
        data_iter = data_loader(args)

        preds = model.predict(data_iter).asnumpy()
        truths = data_iter.get_truths()
        observers = data_iter.get_observers()

        print(preds.shape, truths.shape, observers.shape)

        pred_output = {}
        for i, label in enumerate(data_iter._data_format.class_labels):
            pred_output['class_%s' % label] = truths[:, i]
            pred_output['score_%s' % label] = preds[:, i]
        for i, obs in enumerate(data_iter._data_format.obs_vars):
            pred_output[obs] = observers[:, i]

        import pandas as pd
        df = pd.DataFrame(pred_output)
        if args.predict_output:
            logging.info('Write prediction file to %s' % args.predict_output)
            outdir = os.path.dirname(args.predict_output)
            if outdir and not os.path.exists(outdir):
                os.makedirs(outdir)
            df.to_hdf(args.predict_output, 'Events', format='table')

            from common.util import plotROC
            plotROC(preds, truths, output=os.path.join(outdir, 'roc.pdf'))

            from root_numpy import array2root
            array2root(df.to_records(index=False),
                       filename=args.predict_output.rsplit('.', 1)[0] +
                       '.root',
                       treename='Events',
                       mode='RECREATE')
Example #6
    def writeOutPrediction(self, predicted, features, truth, weights,
                           outfilename, inputfile):
        # predicted is a list
        print("Prediction started")
        from root_numpy import array2root

        namesstring = 'prob_isPrompt, prob_isNonPrompt, prob_isFake, prob_lep_isFromSUSYandHF,'  #prob_isFromSUSY, prob_isFromSUSYHF,'
        for label in self.truth_branches:
            namesstring += label + ', '
        features_string = ', '.join(self.global_branches)
        namesstring += features_string
        out = np.core.records.fromarrays(np.vstack(
            (predicted[0].transpose(), truth[0].transpose(),
             features[0][:, :].transpose())),
                                         names=namesstring)

        # if one predicts on a DataCollection one has to change
        # the file extension to .root
        if not outfilename.endswith(".root"):
            print("Predicted from a DataCollection make root files")
            print("Note that outfiles files extensions must be adapted...")
            filename, _ = os.path.splitext(outfilename)
            outfilename = filename + ".root"
        print("making {}".format(outfilename.split('/')[-1]))
        array2root(out, outfilename, 'tree')
Example #7
    def save_data(self, metadata, data):
        if self.treemaker.uses_arrays:
            # Activate Joey's array saving code
            dataframe_to_root(data,
                              self.path,
                              treename=self.treemaker.__name__,
                              mode='recreate')

        else:
            # Check we really aren't using arrays, otherwise we'll crash with a very uninformative message
            for branch_name in data.columns:
                if is_array_field(data, branch_name):
                    raise TypeError(
                        "Column %s is an array field, and you want to save to root. Either "
                        "(1) use MultipleRowExtractor-based minitrees; or "
                        "(2) add a uses_arrays=True attribute to the %s class; or "
                        "(3) use pickle as your minitree format." %
                        (branch_name, self.treemaker.__class__.__name__))
            root_numpy.array2root(data.to_records(),
                                  self.path,
                                  treename=self.treemaker.__name__,
                                  mode='recreate')

        # Add metadata as JSON in a TNamed in the same ROOT file
        bla = ROOT.TNamed('metadata', json.dumps(metadata))
        minitree_f = ROOT.TFile(self.path, 'UPDATE')
        bla.Write()
        minitree_f.Close()
Example #8
def run(name, source, quick=False):
    print time.asctime(time.localtime()), "Filling BDT Branches"  

    branch_names = joblib.load("pickle/variables.pkl")
    
    if quick:
        signal = joblib.load('pickle/all_signalq.pkl')   
        clf = joblib.load("pickle/" + name + "quick.pkl")     
    else:
        signal = joblib.load('pickle/all_signal.pkl')
        clf = joblib.load("pickle/" + name + ".pkl")

    # predict and write probability of each MC event being signal
    bdt_MC_predicted = clf.predict_proba(signal)
    bdt_MC_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root((np.hsplit(bdt_MC_predicted,2)[1]), "/net/storage03/data/users/dlafferty/NTuples/SignalMC/2012/combined/Bs2phiphi_MC_2012_combined_corrected_TupleA_BDT.root", "DecayTree")

    # predict and write probability of every data event being signal
    all_data = root2array("/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root", "DecayTree", branch_names)
    all_data = rec2array(all_data)

    bdt_data_predicted = clf.predict_proba(all_data)
    bdt_data_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root((np.hsplit(bdt_data_predicted,2)[1]), "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root", "DecayTree")
        
    print time.asctime(time.localtime()), "Branches Filled!"
Example #9
    def _predict(self,
                 X,
                 features_names=None,
                 model_type=('classification', None)):
        """
        Predict data

        :param pandas.DataFrame X: data shape [n_samples, n_features]
        :return: predicted values of shape n_samples
        """
        self._check_fitted()

        directory = self._create_tmp_directory()
        try:
            with tempfile.NamedTemporaryFile(mode="w",
                                             suffix='.xml',
                                             dir=directory,
                                             delete=True) as file_xml:
                file_xml.write(self.formula_xml)
                file_xml.flush()
                add_info = _AdditionalInformationPredict(directory,
                                                         file_xml.name,
                                                         features_names,
                                                         self._method_name,
                                                         model_type=model_type)
                root_numpy.array2root(X.astype(numpy.float32).to_records(),
                                      filename=add_info.filename,
                                      treename=add_info.treename)
                prediction = self._run_tmva_predict(add_info)
        finally:
            self._remove_tmp_directory(directory)

        return prediction
Example #10
    def _fit(self,
             X,
             y,
             sample_weight=None,
             features_names=None,
             model_type='classification'):
        """
        Train the classifier

        :param pandas.DataFrame X: data shape [n_samples, n_features]
        :param list | numpy.array y: values - array-like of shape [n_samples]
        :param list | numpy.array sample_weight: weight of events,
               array-like of shape [n_samples] or None if all weights are equal
        :return: self
        """
        # saving data to 2 different root files.
        directory = self._create_tmp_directory()
        add_info = _AdditionalInformation(directory,
                                          features_names,
                                          model_type=model_type)
        try:
            X[add_info.weight_column] = sample_weight
            X[add_info.target_column] = y
            root_numpy.array2root(X.to_records(),
                                  filename=add_info.filename,
                                  treename=add_info.treename)
            self._run_tmva_training(add_info)
        finally:
            self._remove_tmp_directory(directory)

        return self
Example #11
def test_array2tree_fixed_length_arrays():
    f = load(['fixed1.root', 'fixed2.root'])
    a = rnp.root2array(f)
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
Example #12
    def writeOutPrediction(self, predicted, features, truth, weights, outfilename, inputfile):
        # predicted will be a list
        from root_numpy import array2root
        out = np.core.records.fromarrays(
            np.vstack((predicted[0].transpose(), truth[0].transpose(), features[0][:, 0:2].transpose())),
            names='prob_isPrompt, prob_isNonPrompt, prob_isFake, lep_pt, lep_eta')
        array2root(out, outfilename, 'tree')
Example #13
def run(name, source, quick=False):
    print time.asctime(time.localtime()), "Filling BDT Branches"

    branch_names = joblib.load("pickle/variables.pkl")

    if quick:
        signal = joblib.load('pickle/all_signalq.pkl')
        clf = joblib.load("pickle/" + name + "quick.pkl")
    else:
        signal = joblib.load('pickle/all_signal.pkl')
        clf = joblib.load("pickle/" + name + ".pkl")

    # predict and write probability of each MC event being signal
    bdt_MC_predicted = clf.predict_proba(signal)
    bdt_MC_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root((
        np.hsplit(bdt_MC_predicted, 2)[1]
    ), "/net/storage03/data/users/dlafferty/NTuples/SignalMC/2012/combined/Bs2phiphi_MC_2012_combined_corrected_TupleA_BDT.root",
               "DecayTree")

    # predict and write probability of every data event being signal
    all_data = root2array(
        "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
        "DecayTree", branch_names)
    all_data = rec2array(all_data)

    bdt_data_predicted = clf.predict_proba(all_data)
    bdt_data_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root((
        np.hsplit(bdt_data_predicted, 2)[1]
    ), "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
               "DecayTree")

    print time.asctime(time.localtime()), "Branches Filled!"
Example #14
    def SaveToRoot(self,df,path_output,output_name=None,out_idx=''):
        # Get the unique samples as a list #
        if output_name is None:
            sample_list = list(df[parameters.split_name].unique())

            # Loop over samples #
            for sample in sample_list:
                sample_df = df.loc[df[parameters.split_name]==sample] # We select the rows corresponding to this sample

                # Remove tag and sample name (info in target as bool) #
                sample_df = sample_df.drop('tag',axis=1)
                sample_df = sample_df.drop('sample',axis=1)

                # From df to numpy array with dtype #
                sample_output = sample_df.to_records(index=False,column_dtypes='float64')
                sample_output.dtype.names = parameters.make_dtype(sample_output.dtype.names)# because ( ) and . are an issue for root_numpy
                sample_output_name = os.path.join(path_output,sample+out_idx+'.root')

                # Save as root file #
                array2root(sample_output,sample_output_name,mode='recreate')
                logging.info('Output saved as : '+sample_output_name)
        else:
            # From df to numpy array with dtype #
            full_output = df.to_records(index=False,column_dtypes='float64')
            full_output.dtype.names = parameters.make_dtype(full_output.dtype.names)# because ( ) and . are an issue for root_numpy
            full_output_name = os.path.join(path_output,output_name)
            array2root(full_output,full_output_name,mode='recreate')
            logging.info('Output saved as : '+full_output_name)
Example #15
    def writeOutPrediction(self, predicted, features, truth, weights,
                           outfilename, inputfile):
        # predicted will be a list
        print('writeout')
        print('predicted', predicted[0].shape)
        print('features', features[0].shape)
        print('truth', truth[0].shape)

        def unroll(a):
            a = np.reshape(a,
                           [a.shape[0], a.shape[1] * a.shape[2], a.shape[3]])
            return a

        #unroll to event x vector
        # first 100 are enough for now
        parr = predicted[0][:100, ...]  #unroll(predicted[0])
        farr = features[0][:100, ...]  #unroll(features[0])
        tarr = truth[0][:100, ...]  #unroll(truth[0])

        from DeepJetCore.TrainData import TrainData
        #use traindata as data storage
        td = TrainData()
        td._store([parr, farr, tarr], [], [])
        td.writeToFile(outfilename)

        return

        # the block below is unreachable (early return above); kept as a reference for the ROOT output format
        from root_numpy import array2root
        out = np.core.records.fromarrays(
            [
                parr[:, :, 0],
                parr[:, :, 1],
                parr[:, :, 2],
                parr[:, :, 3],
                parr[:, :, 4],
                parr[:, :, 5],
                parr[:, :, 6],
                parr[:, :, 7],
                parr[:, :, 9],
                parr[:, :, 10],
                tarr[:, :, 0],
                tarr[:, :, 1],
                tarr[:, :, 2],
                tarr[:, :, 3],
                tarr[:, :, 4],
                tarr[:, :, 5],
                tarr[:, :, 6],
                tarr[:, :, 7],
                farr[:, :, 0],
                farr[:, :, 1],
                farr[:, :, 2],
                farr[:, :, 3],
                farr[:, :, 4],
            ],
            names=
            'p_beta, p_posx, p_posy, p_ID0, p_ID1, p_ID2, p_dim1, p_dim2, p_ccoords1, p_coords2, t_mask, t_posx, t_posy, t_ID0, t_ID1, tID_2, t_dim1, t_dim2, f_r, f_g, f_b, f_x, f_y'
        )

        array2root(out, outfilename, 'tree')
Example #16
    def write_tree(self):
        array = []

        all_branches = default_branches + ["label", "train_id"
                                           ] + self.additional_branches
        for aux in all_branches:
            print aux
            a = self.append_arrays(aux)
            if aux == "label" or aux == "train_id" or "dnn_score" in aux:
                tree_name = aux
            elif "evt_weight" in aux:
                tree_name = "weight"
            else:
                tree_name = aux[:-1]
            a_ = self.create_structured_array(a, tree_name)
            array.append(a_)

        for mva in self.mva_helpers.keys():
            a = [
                y for x in [
                    self.mva_helpers[mva]["prediction"]["train"],
                    self.mva_helpers[mva]["prediction"]["test"],
                    self.mva_helpers[mva]["prediction"]["data"]
                ] for y in x
            ]
            a_ = self.create_structured_array(a, mva)
            array.append(a_)

        merged_array = rfn.merge_arrays(array, flatten=True, usemask=False)

        self.output_root = self.output + ".root"
        os.system("rm %s" % self.output_root)
        root_numpy.array2root(merged_array, self.output_root, treename="t")
Example #17
def test_drop_nonscalar_columns():
    array = np.array([1, 2, 3])
    matrix = np.array([[1, 2, 3], [4, 5, 6]])
    bool_matrix = np.array([[True, False, True], [True, True, True]])

    dt = np.dtype([
        ('a', 'i4'),
        ('b', 'int64', array.shape),
        ('c', 'int64', matrix.shape),
        ('d', 'bool_'),
        ('e', 'bool_', matrix.shape)
        ])
    arr = np.array([
        (3, array, matrix, True, bool_matrix),
        (2, array, matrix, False, bool_matrix)],
        dtype=dt)

    path = 'tmp.root'
    array2root(arr, path, 'ntuple', mode='recreate')

    df = read_root(path, flatten=False)
    # the above line throws an error if flatten=True because nonscalar columns
    # are dropped only after the flattening is applied. However, the flattening
    # algorithm can not deal with arrays of more than one dimension.
    assert(len(df.columns) == 2)
    assert(np.all(df.index.values == np.array([0, 1])))
    assert(np.all(df.a.values == np.array([3, 2])))
    assert(np.all(df.d.values == np.array([True, False])))

    os.remove(path)
Example #18
def test_nonscalar_columns():
    array = np.array([1, 2, 3], dtype=np.int64)
    matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
    bool_matrix = np.array([[True, False, True], [True, True, True]],
                           dtype=np.bool_)

    dt = np.dtype([('a', 'i4'), ('b', 'int64', array.shape),
                   ('c', 'int64', matrix.shape), ('d', 'bool_'),
                   ('e', 'bool_', matrix.shape)])
    arr = np.array([(3, array, matrix, True, bool_matrix),
                    (2, array, matrix, False, bool_matrix)],
                   dtype=dt)

    reference_df = pd.DataFrame()
    reference_df['a'] = np.array([3, 2], dtype=np.int32)
    reference_df['b'] = to_object_array([array, array])
    reference_df['c'] = to_object_array([matrix, matrix])
    reference_df['d'] = np.array([True, False], dtype=np.bool_)
    reference_df['e'] = to_object_array([bool_matrix, bool_matrix])

    path = 'tmp.root'
    array2root(arr, path, 'ntuple', mode='recreate')
    df = read_root(path, flatten=False)
    assert_frame_equal(df, reference_df)

    os.remove(path)
Example #19
def to_root(df, path, tree_key="default", mode='w', *kargs, **kwargs):
    """
    Write DataFrame to a ROOT file.

    Parameters
    ----------
    path: string
        File path to new ROOT file (will be overwritten)
    tree_key: string
        Name of tree that the DataFrame will be saved as
    mode: string, {'w', 'a'}
        Mode that the file should be opened in (default: 'w')
    
    Notes
    -----

    Further *kargs and **kwargs are passed to root_numpy's array2root.

    >>> df = DataFrame({'x': [1,2,3], 'y': [4,5,6]})
    >>> df.to_root('test.root')
    
    The DataFrame index will be saved as a branch called 'index'.
    """

    if mode == 'a':
        mode = 'update'
    elif mode == 'w':
        mode = 'recreate'
    else:
        raise ValueError('Unknown mode: {}. Must be "a" or "w".'.format(mode))

    from root_numpy import array2root
    arr = df.to_records()
    array2root(arr, path, tree_key, mode=mode, *kargs, **kwargs)
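A sketch of calling this helper directly, including append mode; the file name is hypothetical:

import pandas as pd

df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
to_root(df, 'test.root', tree_key='tree', mode='w')  # create the file
to_root(df, 'test.root', tree_key='tree', mode='a')  # append to the same tree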
Example #20
def prepareOutput(outputD, ll, plotsD, rootFile, NN_MVA):
    NN_Output = h5py.File("%sNN_Output_applied_%s.h5"%(outputD,ll), "r+")
    mZ_x, mZ_y = (NN_Output['MET_GroundTruth'][:,0]), (NN_Output['MET_GroundTruth'][:,1])
    a_x, a_y = (NN_Output['MET_Predictions'][:,0]), (NN_Output['MET_Predictions'][:,1])
    mZ_r, mZ_phi =  kar2pol(mZ_x, mZ_y)
    mZ_r = NN_Output['Boson_Pt'][:]
    a_r, a_phi = kar2pol(a_x, a_y)


    NN_LongZ, NN_PerpZ = -np.cos(angularrange(np.add(a_phi,-mZ_phi)))*a_r, np.sin(angularrange(a_phi-mZ_phi))*a_r

    #HDF5
    dset = NN_MVA.create_dataset("NN_LongZ", dtype='d', data=NN_LongZ)
    dset1 = NN_MVA.create_dataset("NN_PerpZ", dtype='d', data=NN_PerpZ)
    dset2 = NN_MVA.create_dataset("NN_Phi", dtype='d', data=a_phi)
    dset3 = NN_MVA.create_dataset("NN_Pt", dtype='d', data=a_r)
    dset4 = NN_MVA.create_dataset("Boson_Pt", dtype='d', data=mZ_r)
    dset5 = NN_MVA.create_dataset("NN_x", dtype='d', data=a_x)
    dset6 = NN_MVA.create_dataset("NN_y", dtype='d', data=a_y)
    dset7 = NN_MVA.create_dataset("Boson_x", dtype='d', data=mZ_x)
    dset8 = NN_MVA.create_dataset("Boson_y", dtype='d', data=mZ_y)
    dset9 = NN_MVA.create_dataset("Boson_Phi", dtype='d', data=mZ_phi)
    NN_MVA.close()

    #Root
    #treename = ll+"_nominal/ntuple"
    #Root_array = rnp.root2array(rootFile, treename=treename)
    #print("shape Root_array", Root_array.shape)
    NN_array = np.array([(a_r[i], a_phi[i], a_x[i], a_y[i], NN_LongZ[i], NN_PerpZ[i]) for i in range(len(a_r))],
              dtype=[('NN_Pt', np.float32), ('NN_Phi', np.float32), ('NN_x', np.float32), ('NN_y', np.float32), ('NN_LongZ', np.float32), ('NN_PerpZ', np.float32)])
    print("shape NN_array", NN_array.shape)   
    rnp.array2root(NN_array, "%s/NN_MVA_%s.root"%(outputD,ll), mode='recreate')
Example #21
    def makePrediction(self, model, testdatacollection, outputDir, ident=''):
        import numpy as np
        from root_numpy import array2root
        import os

        outputDir = os.path.abspath(outputDir)

        if len(ident) > 0:
            ident = '_' + ident

        self.__sourceroots = []
        self.__predictroots = []
        self.metrics = []

        for i in range(len(testdatacollection.samples)):
            sample = testdatacollection.samples[i]
            originroot = testdatacollection.originRoots[i]
            outrootfilename = os.path.basename(originroot).split(
                '.')[0] + '_predict' + ident + '.root'

            fullpath = testdatacollection.getSamplePath(sample)
            td = testdatacollection.dataclass

            td.readIn(fullpath)
            truthclasses = td.getUsedTruth()
            regressionclasses = td.regressiontargetclasses

            formatstring = ','.join(
                ['prob_%s%s' % (i, ident) for i in truthclasses])
            features = td.x
            labels = td.y
            #metric=model.evaluate(features, labels, batch_size=10000)
            prediction = model.predict(features)
            if isinstance(prediction, list):
                ######CHANGE FOR NEW FORMAT
                formatstring += ','
                formatstring += ','.join(
                    ['reg_%s%s' % (i, ident) for i in regressionclasses])
                all_write = np.concatenate(prediction, axis=1)

            elif prediction.shape[1] == len(truthclasses):
                all_write = prediction
            else:
                raise ValueError(
                    'Regression (2nd prediction output) can only have up to two values!'
                )

            all_write = np.core.records.fromarrays(np.transpose(all_write),
                                                   names=formatstring)
            array2root(all_write,
                       outputDir + '/' + outrootfilename,
                       "tree",
                       mode="recreate")

            #self.metrics.append(metric)
            self.__sourceroots.append(originroot)
            self.__predictroots.append(outputDir + '/' + outrootfilename)
            print(formatstring)
            print('\ncreated prediction friend tree ' + outputDir + '/' +
                  outrootfilename + ' for ' + originroot)
Example #22
def getArrayToRoot(rec_array, foutname, treename):
    """
        Write a record array to a new tree in a ROOT file
        Inputs: record array, output file name, tree name
    """
    from root_numpy import array2root
    info('(getArrayToRoot) building tree %s in file %s' % (treename, foutname))
    array2root(rec_array, foutname, treename)
Example #23
def write_output_tree(allparticles, outputFile):
    from root_numpy import array2root
    out = np.core.records.fromarrays(
        allparticles.transpose(),
        names=
        "is_reco, reco_posx, reco_posy, reco_e, is_true, true_posx, true_posy, true_e, true_id, n_true"
    )
    array2root(out, outputFile + ".root", 'tree')
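A usage sketch: the function expects one row per particle, with the ten columns named in the record above; the array here is random filler.

import numpy as np

allparticles = np.random.rand(50, 10)  # 50 particles x 10 quantities
write_output_tree(allparticles, 'particles')  # writes particles.root containing a 'tree'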
Example #24
def test_array2tree_charstar():
    a = np.array([b'', b'a', b'ab', b'abc', b'xyz', b''],
                 dtype=[('string', 'S3')])

    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
Example #25
    def _saveAsROOT(self):
        output = self.data.to_records(index=False, column_dtypes='float64')
        # sanitize branch names: root_numpy cannot handle '.', '(', ')', '-' or '*'
        output.dtype.names = [(name.replace('.', 'p').replace('(', '').replace(')', '')
                               .replace('-', '_minus_').replace('*', '_times_'))
                              for name in output.dtype.names]
        array2root(output, self.save_path, mode='recreate')
        print('Output saved as : ' + self.save_path)
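The renaming above works around root_numpy's limited tolerance for special characters in branch names. A standalone sketch of the same substitution, on made-up names:

names = ['tau21', 'jet.pt', 'm(H)', 'a-b', 'x*y']
clean = [n.replace('.', 'p').replace('(', '').replace(')', '')
          .replace('-', '_minus_').replace('*', '_times_') for n in names]
print(clean)  # ['tau21', 'jetppt', 'mH', 'a_minus_b', 'x_times_y']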
Example #26
def test_array2tree_charstar():
    a = np.array([b'', b'a', b'ab', b'abc', b'xyz', b''],
                 dtype=[('string', 'S3')])

    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
Example #27
    def writeOutPrediction(self, predicted, features, truth, weights,
                           outfilename, inputfile):
        # predicted will be a list

        from root_numpy import array2root
        out = np.core.records.fromarrays(predicted[0].transpose(),
                                         names='prob_p, prob_np, prob_f')

        array2root(out, outfilename, 'tree')
Example #28
def add_to_rootfile(rootfile, new_branch, branch_name=None, overwrite=True):
    """Adds a new branch to a given root file.

    .. warning:: Overwrite not working currently!


    Parameters
    ----------
    rootfile : root-dict
        The ROOT-file where the data should be added
    new_branch : numpy.array 1-D, list, root-dict
        A one-dimensional numpy array that contains the data.
    branch_name : str
        The name of the branch, i.e. the name in the dtype of the array.
    """
    from root_numpy import array2root, array2tree

    from rootpy.io import root_open

    rootfile = dev_tool.entries_to_str(rootfile)
    new_branch = dev_tool.entries_to_str(new_branch)
    branch_name = dev_tool.entries_to_str(branch_name)

    # get the right parameters
    # TODO: what is that if there for? an assertion maybe?
    write_mode = 'update'
    branch_name = 'new_branch1' if branch_name is None else branch_name

    # the docstring promises a root-dict, so both keys are read here
    if isinstance(rootfile, dict):
        filename = rootfile.get('filenames')
        treename = rootfile.get('treename')
    new_branch = to_ndarray(new_branch)
    #    new_branch.dtype = [(branch_name, 'f8')]

    # write to ROOT-file
    write_to_root = False

    if os.path.isfile(filename):
        with root_open(filename, mode='a') as root_file:
            tree = getattr(root_file, treename)  # test
            if not tree.has_branch(branch_name):
                write_to_root = True
    # array2tree(new_branch, tree=tree)
    #            f.write("", TObject.kOverwrite)  # overwrite, does not create friends
    else:
        write_mode = 'recreate'
        write_to_root = True
    if write_to_root:
        arr = np.core.records.fromarrays([new_branch], names=branch_name)
        array2root(arr=arr,
                   filename=filename,
                   treename=treename,
                   mode=write_mode)
        return 0
    else:
        return 1
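A hedged usage sketch; the 'filenames' and 'treename' keys follow the root-dict convention the function reads, and the values are hypothetical:

import numpy as np

rootfile = {'filenames': 'ntuple.root', 'treename': 'DecayTree'}
status = add_to_rootfile(rootfile, np.arange(10.0), branch_name='bdt_score')
print(status)  # 0 if the branch was written, 1 if it already existed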
Example #29
    def getHits_adu(self, runID, ohdu, adu_keV, save=False):
        print('image2hit')
        path = self.outPath + self.partial_id
        simulatedImage_adu = self.getSimulatedImage_adu(adu_keV)
        if not self.empty:
            simulatedImage_adu += self.reconstructSCNImage_adu(runID, ohdu)
        hits_adu = self.convertImage2Hits(simulatedImage_adu, save=True)
        if save:
            root_numpy.array2root(hits_adu, path, treename='hitSumm', mode='recreate')
        return hits_adu
Example #30
def save_file(data, pred, proba, filename, model):
    data['isSignal'] = pred
    print(filename)
    data['probSignal'] = proba[:]
    array2root(np.array(data.to_records()),
               'OutputRoot/new_' + model + '_' + filename,
               'nominal',
               mode='recreate')
    print('Save file as {}'.format('new_' + model + '_' + filename))
    return
Example #31
def test_to_root(folder,result_folder,output_root_folder,variables,is_signal,model_label,sample_list=[]):

    if not os.path.isdir(output_root_folder+'/model_'+model_label): os.mkdir(output_root_folder+'/model_'+model_label)

    if sample_list==[]:
        print("   Empty sample list, will use full sample . . .")
        ##Read test sample
        store = pd.HDFStore(result_folder+'test_score_'+model_label+'.h5')
        df_test = store.select("df")

        for n, a in enumerate(variables):
            back = np.array(df_test[a].loc[df_test[is_signal]==0].values, dtype=[(a, np.float64)])
            sign = np.array(df_test[a].loc[df_test[is_signal]==1].values, dtype=[(a, np.float64)])
            print(a," back: ", back)
            print(a," sign: ", sign)
            array2root(back, output_root_folder+'/model_'+model_label+'/test_bkg.root', mode='recreate' if n==0 else 'update')
            array2root(sign, output_root_folder+'/model_'+model_label+'/test_sgn.root', mode='recreate' if n==0 else 'update')
        print("  Signal and background root files written : ", output_root_folder+'/'+model_label+'/test_*.root')

    else:
        full_list = []
        for sl in sample_list:
            full_list += samples[sl]['files']

        for sample in full_list:
            ##Read test sample
            if not os.path.isfile(folder+sample+"_test.h5"):
                print("!!!File ", folder+sample+"_test.h5", " does not exist! Continuing")
                continue

            store = pd.HDFStore(result_folder+sample+"_score_"+model_label+".h5")
            df_test = store.select("df")
            newFile = TFile(output_root_folder+'/model_'+model_label+'/'+sample+'.root', 'recreate')
            newFile.cd()
            for n, a in enumerate(variables):
                arr = np.array(df_test[a].values, dtype=[(a, np.float64)])
                #print(a, " values: ", arr)
                #array2root(arr, output_root_folder+'/model_'+model_label+'/'+sample+'.root', mode='update')#mode='recreate' if n==0 else 'update')
                if n==0: skim = array2tree(arr)
                else: array2tree(arr, tree=skim)#mode='recreate' if n==0 else 'update')

            skim.Write()
            ##Recreate c_nEvents histogram
            counter = TH1F("c_nEvents", "Event Counter", 1, 0., 1.)
            counter.Sumw2()
            ##Fill counter histogram with the first entry of c_nEvents
            counter.Fill(0., df_test["c_nEvents"].values[0])
            ##print("counter bin content: ", counter.GetBinContent(1))
            counter.Write()
            newFile.Close()
            #counter.Delete()

            
            print("  Root file written : ", output_root_folder+'/model_'+model_label+'/'+sample+'.root')
Example #32
    def CreateTestSample(path, **kwargs):
        r"""Creates a :py:mod:`ROOT` file with toy data to be used for tests.

        The output file contains one tree with **nevents** number of entries represented
        by `nbranches` branches. Random numbers for each branch are drawn according to a
        chisquare distribution with a mean indicated by the branch index. The name of
        the output tree is given by **tree** and the branches are of the form
        'branch_1', 'branch_2', ...

        Numbers are generated using the :class:`numpy.random` module and the output file
        is filled using the :func:`root_numpy.array2root` method.

        If a file with the same name already exists it will be overwritten (this can be
        changed with the **overwrite** keyword argument). If **mkdir** is set to
        ``True`` (default: ``False``), directories in **path** which do not yet exist
        will be created automatically.

        :param path: path of output :py:mod:`ROOT` file
        :type path: ``str``

        :param \**kwargs: see below

        :Keyword Arguments:

            * **nevents** (``int``) -- number of events in the output tree (default:
              10000)

            * **nbranches** (``int``) -- number of branches (default: 10)

            * **tree** (``int``) -- name of the output tree (default: 'tree')

            * **overwrite** (``bool``) -- overwrite an existing file located at **path**
              (default: ``True``)

            * **mkdir** (``bool``) -- create non-existing directories in **path**
              (default: ``False``)
        """
        basedir = os.path.dirname(os.path.abspath(path))
        if not os.path.isdir(basedir):
            logger.error("Directory '{}' does not exist!".format(basedir))
            raise IOError("Path not found!")
        nevents = int(kwargs.get("nevents", 1e4))
        nbranches = int(kwargs.get("nbranches", 10))
        treename = kwargs.get("tree", "tree")
        array = np.core.records.fromarrays(
            np.transpose(
                np.random.chisquare(range(1, nbranches + 1, 1),
                                    size=(nevents, nbranches))),
            names=",".join(
                ["branch_{}".format(i + 1) for i in range(nbranches)]),
        )
        rnp.array2root(array, path, treename=treename, mode="recreate")
        if os.path.isfile(path):
            logger.info("Created '{}'.".format(path))
Example #33
def save_file(data, pred, proba, filename, model):
    data['isSignal'] = pred
    print(filename)
    #for index in range(20):
    #    print "Proba {}".format(proba[index,0])
    data['probSignal'] = proba[:, 0]
    array2root(np.array(data.to_records()),
               'OutputRoot/new_BDT_' + model + '_' + filename,
               'nominal',
               mode='recreate')
    return
Example #34
def add_branch(arr, bname, rfile, tname):
    """
    Add the passed array to an existing TTree in an existing TFile

    Args:
        arr (numpy.array): 1D numpy array that will be stored under a new branch
        bname (str): Branch name for the values
        rfile (str): Filename to which the new branch should be added
        tname (str): Name of the TTree to which the values should be added
    """
    arr = np.array(arr, dtype=[(bname, np.find_common_type([arr.dtype], []))])
    array2root(arr, rfile, treename=tname, mode='update')
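A usage sketch; the file and tree names are hypothetical (the file is opened in 'update' mode, so it may also be created if absent):

import numpy as np

scores = np.random.rand(500)
add_branch(scores, 'bdt_score', 'ntuple.root', 'tree')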
Example #35
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool_)])
    tmp_fd, tmp_path = tempfile.mkstemp(suffix='.root')
    rnp.array2root(a, tmp_path, mode='recreate')
    os.close(tmp_fd)
    os.remove(tmp_path)
Example #36
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool_)])
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
        # extend the tree
        rnp.array2root(a, tmp.GetName(), mode='update')
        a_conv2 = rnp.root2array(tmp.GetName())
        assert_array_equal(np.hstack([a, a]), a_conv2)
Example #37
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool_)])
    tmp_fd, tmp_path = tempfile.mkstemp(suffix='.root')
    rnp.array2root(a, tmp_path, mode='recreate')
    a_conv = rnp.root2array(tmp_path)
    assert_array_equal(a, a_conv)
    # extend the tree
    rnp.array2root(a, tmp_path, mode='update')
    a_conv2 = rnp.root2array(tmp_path)
    assert_array_equal(np.hstack([a, a]), a_conv2)
    os.close(tmp_fd)
    os.remove(tmp_path)
Example #38
def to_root(df, path, key='default', mode='w', *args, **kwargs):
    """
    Write DataFrame to a ROOT file.

    Parameters
    ----------
    path: string
        File path to new ROOT file (will be overwritten)
    key: string
        Name of tree that the DataFrame will be saved as
    mode: string, {'w', 'a'}
        Mode that the file should be opened in (default: 'w')

    Notes
    -----

    Further *args and **kwargs are passed to root_numpy's array2root.

    >>> df = DataFrame({'x': [1,2,3], 'y': [4,5,6]})
    >>> df.to_root('test.root')

    The DataFrame index will be saved as a branch called '__index__*',
    where * is the name of the index in the original DataFrame
    """

    if mode == 'a':
        mode = 'update'
    elif mode == 'w':
        mode = 'recreate'
    else:
        raise ValueError('Unknown mode: {}. Must be "a" or "w".'.format(mode))

    from root_numpy import array2root
    # We don't want to modify the user's DataFrame here, so we make a shallow copy
    df_ = df.copy(deep=False)
    name = df_.index.name
    if name is None:
        # Handle the case where the index has no name
        name = ''
    df_['__index__' + name] = df_.index
    arr = df_.to_records(index=False)
    array2root(arr, path, key, mode=mode, *args, **kwargs)
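A sketch of the index round-trip this version performs, reading back with root_numpy; the file name is hypothetical:

import pandas as pd
from root_numpy import root2array

df = pd.DataFrame({'x': [1, 2, 3]}, index=pd.Index([10, 20, 30], name='evt'))
to_root(df, 'test.root', 'tree')
arr = root2array('test.root', 'tree')
print(arr.dtype.names)  # ('x', '__index__evt')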
Example #39
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool_)])
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
        # extend the tree
        rnp.array2root(a, tmp.GetName(), mode='update')
        a_conv2 = rnp.root2array(tmp.GetName())
        assert_array_equal(np.hstack([a, a]), a_conv2)
        # write into subdirectory
        tname = 'root/sub/tree'
        rnp.array2root(a, tmp.GetName(), treename=tname, mode='update')
        a_conv3 = rnp.root2array(tmp.GetName(), treename=tname)
        assert_array_equal(a, a_conv3)
        # try creating tree with conflicting name
        assert_raises(IOError, rnp.array2root, a, tmp.GetName(),
                treename='root/sub', mode='update')
        # try creating subdirectory with conflicting name
        assert_raises(IOError, rnp.array2root, a, tmp.GetName(),
                treename='root/sub/tree/error', mode='update')
Example #40
    def save_data(self, metadata, data):
        if self.treemaker.uses_arrays:
            # Activate Joey's array saving code
            dataframe_to_root(data, self.path, treename=self.treemaker.__name__, mode='recreate')

        else:
            # Check we really aren't using arrays, otherwise we'll crash with a very uninformative message
            for branch_name in data.columns:
                if is_array_field(data, branch_name):
                    raise TypeError("Column %s is an array field, and you want to save to root. Either "
                                    "(1) use MultipleRowExtractor-based minitrees; or "
                                    "(2) add a uses_arrays=True attribute to the %s class; or "
                                    "(3) use pickle as your minitree format." % (branch_name,
                                                                                 self.treemaker.__class__.__name__))
            root_numpy.array2root(data.to_records(), self.path,
                                  treename=self.treemaker.__name__, mode='recreate')

        # Add metadata as JSON in a TNamed in the same ROOT file
        bla = ROOT.TNamed('metadata', json.dumps(metadata))
        minitree_f = ROOT.TFile(self.path, 'UPDATE')
        bla.Write()
        minitree_f.Close()
Example #41
    def _fit(self, X, y, sample_weight=None, features_names=None, model_type='classification'):
        """
        Train the classifier

        :param pandas.DataFrame X: data shape [n_samples, n_features]
        :param list | numpy.array y: values - array-like of shape [n_samples]
        :param list | numpy.array sample_weight: weight of events,
               array-like of shape [n_samples] or None if all weights are equal
        :return: self
        """
        # saving data to 2 different root files.
        directory = self._create_tmp_directory()
        add_info = _AdditionalInformation(directory, features_names, model_type=model_type)
        try:
            X[add_info.weight_column] = sample_weight
            X[add_info.target_column] = y
            root_numpy.array2root(X.to_records(), filename=add_info.filename,
                                  treename=add_info.treename)
            self._run_tmva_training(add_info)
        finally:
            self._remove_tmp_directory(directory)

        return self
Example #42
    def _predict(self, X, features_names=None, model_type=('classification', None)):
        """
        Predict data

        :param pandas.DataFrame X: data shape [n_samples, n_features]
        :return: predicted values of shape n_samples
        """
        self._check_fitted()

        directory = self._create_tmp_directory()
        try:
            with tempfile.NamedTemporaryFile(mode="w", suffix='.xml', dir=directory, delete=True) as file_xml:
                file_xml.write(self.formula_xml)
                file_xml.flush()
                add_info = _AdditionalInformationPredict(directory, file_xml.name, features_names, self._method_name,
                                                         model_type=model_type)
                root_numpy.array2root(X.astype(numpy.float32).to_records(), filename=add_info.filename,
                                      treename=add_info.treename)
                prediction = self._run_tmva_predict(add_info)
        finally:
            self._remove_tmp_directory(directory)

        return prediction
Example #43
    def write_output(self,outfile_name,ROOT=True,pickle=False):
        '''
        Converts outputed event data into re-usable data format,
        either ROOT or a pickled numpy record array

        Input:
        -outfile_name: string, name of outfile to write to (without extension)
        -ROOT: bool, True to write a ROOT file
        -pickle: bool, True to write a pickle file
        '''
        # build record array of outputed event dicts
        # assume all dict's are the same structure
        dt=[]
        for item in self.events[0]:
            if type(self.events[0][item])==numpy.ndarray:
                dt+=[(item,type(self.events[0][item][0]),len(self.events[0][item]))]
            else:
                dt+=[(item,type(self.events[0][item]))]

        dt=numpy.dtype(dt)
        values=[tuple(each.values()) for each in self.events]
        out=numpy.zeros((len(self.events),),dtype=dt)
        out[:]=values

        # convert record array to root tree, write to file
        if ROOT:
            if self.logging:
                print "Creating file %s.root" % (outfile_name)
            root_numpy.array2root(out,'%s.root' % (outfile_name))
        # write record array to pickle file
        if pickle:
            import pickle as pickle_module  # the boolean 'pickle' argument shadows the stdlib module
            if self.logging:
                print "Creating file %s.pickle" % (outfile_name)
            with open('%s.pickle' % (outfile_name), 'wb') as f:
                pickle_module.dump(out, f)
Example #44
print "GAMMA2 done..."
root['kappa'] = bcc['KAPPA']
print "KAPPA done..."
root['size'] = bcc['SIZE']
print "SIZE done..."
root['eps1'] = bcc["EPSILON"][0:,0]
print "EPSILON 1 done..."
root['eps2'] = bcc["EPSILON"][0:,1]
print "EPSILON 2 done..."
root["mag"] = bcc["TMAG"][0:,2]
print "TMAG done..."
root["teps1"] = bcc["TE"][0:,0]
print "TEPS1 done..."
root["teps2"] =bcc["TE"][0:,1]
print "TEPS2 done..."
root["tra"] = bcc["TRA"]
print "TRA done..."
root["tdec"] = bcc["TDEC"]
print "TDEC done..."
root["tsize"] = bcc["TSIZE"]
print "TSIZE done..."
root["mu"] = bcc["MU"]

print "All Done !"

array2root(root,output,'bcc')
Example #45
import numpy as np
from root_numpy import root2array, rec2array, array2root

bdt_file = '/lustre/cmswork/hh/mvas/xgboost/train_3CSVM_0.5sig_0.7bkg_weighted.pkl'

branch_names = ["H1_pT", "H2_pT",
                "H1_dEta_abs", "H2_dEta_abs",
                "H1_dPhi_abs", "H2_dPhi_abs"]

# compute bdt values
bdt = joblib.load(bdt_file)

for root_file in args.root_files:
    print "processing {}".format(root_file)
    # load vars data from ROOT
    data = root2array(root_file, args.tree_name, branch_names)

    data_bdt = bdt.predict_proba(rec2array(data[branch_names]))[:, 1].astype(np.float32)  # contiguous copy so the dtype view below is valid

    # save to ROOT file
    data_bdt.dtype = [(args.bdt_name, np.float32)]
    array2root(data_bdt, root_file, "tree")
Example #46
                               range=(min_value, max_value), label='Signal SM', **hist_params)
    areaSig = sum(np.diff(bins)*values)
    
    #print areaBKG, " ",areaBKG2 ," ",areaSig
    if n == 0 : plt.legend(loc='best')
    plt.title(feature)
plt.savefig("Variables_"+subset+BKG+"_benchmarks_"+ext)
plt.clf()
"""
#################################################################################
### Define classifiers to test
traindataset, valdataset = train_test_split(dataset, random_state=11, train_size=0.50)
traindatasetmix, valdatasetmix = train_test_split(datasetmix, random_state=11, train_size=0.50)
#################################################################################
arr = valdatasetmix.to_records()
array2root(arr, outputCentral+"_AppliedToMixed"+typedata+".root" , 'tree', 'recreate') 
arr = dataset.to_records()
array2root(arr, outputCentral+"_AppliedToPlain"+typedata+".root" , 'tree', 'recreate')
if typedata=="Data": 
  arr = dataset20.to_records()
  array2root(arr, outputCentral+"_AppliedTo20pOfPlain"+typedata+".root" , 'tree', 'recreate')
#

for ii in range(0,3):
   if ii==0 :
     train= trainFeaturesplot
     Var='All'
   if ii==1 :
     train= trainFeaturesObvious
     Var='Mass'
   if ii==2 :
Example #47
import numpy as np
import root_numpy as rnp

import postprocessing.ttree
import ROOT

# Gets a TTree from a ROOT file.
signal_file = ROOT.TFile("ROOT_data/signal1MTraining.root")
signal_tree = signal_file.Get("h101;1")

# Converts the TTree to a NumPy structured array.
array = rnp.tree2array(signal_tree)

# Generates three new branches to be added to the array.
classifier_branch = postprocessing.ttree.name_classifier_branches(
    np.random.rand(247015, 3), ['ClassA', 'ClassB', 'ClassC'])

# Attaches the new branches to the original array.
together = postprocessing.ttree.join_struct_arrays([array, classifier_branch])

# Outputs the total array as a ROOT file.
rnp.array2root(together, "ROOT_data/together.root")
Example #48
import sys
import numpy as np
from root_numpy import array2root
print sys.argv[1]
# need to find out how many columns are in the file

f = open(sys.argv[1])
l = f.readline()
colcount = l.count(',')
f.close()

cols = np.linspace(1,colcount,colcount,dtype=int)
data = np.genfromtxt(sys.argv[1],delimiter=',',names=True,usecols=cols)

array2root(data, sys.argv[1].replace('.csv','.root'),'outputTree')
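The same conversion as a self-contained sketch without sys.argv; the CSV name is hypothetical, and column 0 is skipped just as above (it is assumed to be a row index):

import numpy as np
from root_numpy import array2root

with open('table.csv') as f:
    colcount = f.readline().count(',')
cols = range(1, colcount + 1)  # keep the named columns, skip column 0
data = np.genfromtxt('table.csv', delimiter=',', names=True, usecols=cols)
array2root(data, 'table.root', 'outputTree')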
            fname, "ttree", Output_variables, None, 0, nfiles_per_sample, skip_n_events, False, "weight"
        )
        Output_tree = rootnp.rec2array(Output_tree)

        Output_tree_final = np.ndarray(
            (Output_tree.shape[0],),
            dtype=[
                ("Jet_flavour", float),
                ("TagVarCSV_vertexCategory", float),
                ("Jet_pt", float),
                ("Jet_eta", float),
                ("Jet_CSVIVF", float),
                ("BDTG", float),
            ],
        )  # , buffer = np.array([1,2,3,4,5]))
        for idx, val in enumerate(BDTG):
            Output_tree_final[idx][0] = Output_tree[idx][0]
            Output_tree_final[idx][1] = Output_tree[idx][1]
            Output_tree_final[idx][2] = Output_tree[idx][2]
            Output_tree_final[idx][3] = Output_tree[idx][3]
            Output_tree_final[idx][4] = Output_tree[idx][4]
            Output_tree_final[idx][5] = BDTG[idx]

        Output_tree_final = Output_tree_final.view(np.recarray)
        tree = rootnp.array2root(
            Output_tree_final, "trainPlusBDTG_CombinedSV" + category + "_" + flavor + ".root", "ttree", "recreate"
        )
        log.info("Output file dumped in trainPlusBDTG_CombinedSV" + category + "_" + flavor + ".root")

log.info("done")
Example #50
    if events[i]["event"]!=currentevent:
        if cutType=="bumphunt":
            candidates.sort(key=lambda x:events[x]["tarChisq"],reverse=False)
        elif cutType=="vertexing":
            candidates.sort(key=lambda x:events[x]["bscChisq"],reverse=False)
        elif cutType=="none":
            candidates.sort(key=lambda x:events[x]["tarChisq"],reverse=False)
        else:
            raise Exception("invalid cut type")
#        ranked_candidates = sorted(candidates, key=lambda x:events[x][sortkey],reverse=highestBest)
        rank=1
        for j in candidates:
            output[j]["nPass"]=len(candidates)
            output[j]["rank"]=rank
            rank+=1
        del candidates[:]
        currentevent = events[i]["event"]
    if output[i]["cut"]!=0:
        candidates.append(i)

if cutOutput:
    output = output[output["cut"]!=0]
if onlyBest:
    output = output[output["rank"]==1]
if onlyOnly:
    output = output[output["nPass"]==1]

root_numpy.array2root(output,remainder[0],mode="recreate",treename="cut")
#newtree=root_numpy.array2tree(output)
#newtree.Scan()
Example #51
def savedata(dt, dt_LE, basename, clf=None, dt_real=None):
    
    # Get the data to write
    dt_out = dt.data
    labels = dt.treenames + dt.w_varnames + m_weightnames

    # Also take care of the low energy data
    dt_LE_out = dt_LE.data

    # If we have real data, save as well
    if dt_real is not None:
        dt_real_out = dt_real.data

    # if a classifier is passed, then add that to the data field
    if clf is not None:
        scores = clf.decision_function(dt.getDataNoWeight())
        scores = scores.reshape((len(scores),1))
        dt_out = np.concatenate((dt_out, scores),axis=1)
        labels += ['score']

        LE_scores = clf.decision_function(dt_LE.getDataNoWeight())
        LE_scores = LE_scores.reshape((len(LE_scores),1))
        dt_LE_out = np.concatenate((dt_LE_out, LE_scores),axis=1)

        if dt_real is not None:
            real_scores = clf.decision_function(dt_real.getDataNoWeight())
            real_scores = real_scores.reshape((len(real_scores),1))
            dt_real_out   = np.concatenate((dt_real_out,real_scores),1) 
        
    csl = ""
    for i in range(len(labels)-1):
        csl += labels[i] + ","
    csl += labels[-1]

    # Separate the data into signal and background
    dt_out_sig = dt_out[ dt.targets > 0.5 ]
    dt_out_bkg = dt_out[ dt.targets < 0.5 ]

    # Turn into record array
    dt_out_sig = np.rec.fromrecords(dt_out_sig, names=csl)
    dt_out_bkg = np.rec.fromrecords(dt_out_bkg, names=csl)
    dt_LE_out  = np.rec.fromrecords(dt_LE_out, names=csl)

    if dt_real is not None:
        dt_real_out = np.rec.fromrecords(dt_real_out, names=csl)

    for wn in m_weightnames:
        dt_out_sig[wn] *= dt.sf
        dt_out_bkg[wn] *= dt.sf
        dt_LE_out[wn]  *= dt_LE.sf

    # Convert directly to a root file
    signame   = 'processed_trees/' + basename + '_sig.root'
    bkgname   = 'processed_trees/' + basename + '_bkg.root'
    dataname  = 'processed_trees/' + basename + '_data.root'

    array2root(dt_out_sig, signame, 'tree','recreate')
    array2root(dt_out_bkg, bkgname, 'tree','recreate')

    # Put LE sig into the same file
    array2root(dt_LE_out, signame, 'tree')

    # Save real data if added
    if dt_real is not None:
        array2root(dt_real_out, dataname, 'tree','recreate')
Example #52
def save_array(outputArray, outputName):
	#array = np.savetxt(outputName+".txt", outputArray, fmt='%.4e',delimiter = "|")
	outputString = str(outputName)
	logging.info("Creating .Root file")
	rnp.array2root(outputArray,outputString,treename='Training_Variables',mode='recreate')
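A usage sketch, assuming the module-level imports (rnp, logging) shown elsewhere; the structured array and output name are hypothetical:

import numpy as np

arr = np.zeros(10, dtype=[('x', np.float32), ('y', np.float32)])
save_array(arr, 'training.root')  # writes a 'Training_Variables' tree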
Example #53
# Convert to structured array
ntuple_list = df_ntuple.values.tolist()  # as_matrix() was removed in newer pandas
ntuple_list = [tuple(a) for a in ntuple_list]

name_type_ref = [('BDT', 'f4'),
                 ('Class', 'i4'),
                 ('EventNumber', 'i4'),
                 ('EventWeight', 'f4'),
                 ('MET', 'f4'),
                 ('Mtop', 'f4'),
                 ('dPhiLBmin', 'f4'),
                 ('dPhiVBB', 'f4'),
                 ('dRBB', 'f4'),
                 ('dYWH', 'f4'),
                 ('mBB', 'f4'),
                 ('mBBJ', 'f4'),
                 ('mTW', 'f4'),
                 ('nTags', 'i4'),
                 ('nJ', 'i4'),
                 ('pTB1', 'f4'),
                 ('pTB2', 'f4'),
                 ('pTJ3', 'f4'),
                 ('pTV', 'f4'),
                 ('sample', 'S15')]

ntuple_array = np.array(ntuple_list, dtype=name_type_ref)

# Write to ROOT file.
array2root(ntuple_array, '/Volumes/THUMB/VHbb-data/write/skl_BDT_results.root',
           mode='recreate')
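To sanity-check the write, the file can be read back with root2array and compared field by field; a sketch reusing the path above ('tree' is array2root's default tree name):

import numpy as np
from root_numpy import root2array

check = root2array('/Volumes/THUMB/VHbb-data/write/skl_BDT_results.root', 'tree')
assert np.array_equal(check['EventNumber'], ntuple_array['EventNumber'])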
        t = f.NStations
        nentries = t.GetEntriesFast()
        cut = np.zeros(nentries, dtype='bool')
        for i in range(nentries):
            t.GetEntry(i)
            rootID = '%s_%s_%s' % (t.Run, t.Event, t.SubEvent)
            if rootID in eventID:
                cut[i] = True
        f.Close()

        ## WRITE TO FILE ##
        # Most likely composition
        try:
            values = np.zeros(nentries, dtype=[('comp', 'S1')])
            values[cut] = d['llh_comp'][:]
            root_numpy.array2root(values, outFile, 'llh_comp', 'recreate')
        except ValueError:
            print 'Length mismatch. Skipping...'
            continue

        # Most likely energy
        values = np.zeros(nentries, dtype=[('energy', float)])
        values[cut] = d['ML_energy'][:]
        root_numpy.array2root(values, outFile, 'ML_energy')

        # Likelihoods
        keys = ['pLLH', 'hLLH', 'oLLH', 'fLLH']
        for key in keys:
            values = np.zeros(nentries, dtype=[('llh', float)])
            values[cut] = d[key][:]
            root_numpy.array2root(values, outFile, key)
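Note the write pattern above: the first array2root call uses mode 'recreate' to start a fresh file, while the later calls rely on the default 'update' mode, so each named tree ('llh_comp', 'ML_energy', 'pLLH', ...) ends up in the same output file. A minimal sketch of the same idea with placeholder arrays:

import numpy as np
from root_numpy import array2root

energies = np.zeros(5, dtype=[('energy', float)])
comps = np.zeros(5, dtype=[('comp', 'S1')])
array2root(energies, 'multi_tree.root', 'ML_energy', 'recreate')  # new file
array2root(comps, 'multi_tree.root', 'llh_comp')  # appended (default mode is 'update')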
   pool_files.append(extfile)
   nfiles_per_sample = None
   X_val = rootnp.root2array(extfile.path,'tree',variables,None,0,nfiles_per_sample,args.testEvery,False,'weight')
   X_val = rootnp.rec2array(X_val)
   BDTG =  clf_val.predict_proba(X_val)[:,1]
   if (args.TMVAOut):
   	# map [0, 1] probabilities onto the TMVA-style [-1, 1] range
   	# (see the vectorized sketch after this example)
   	BDTG = [i*2-1 for i in BDTG]
   	
   Output_variables = ['flavour','vertexCategory','jetPt','jetEta']
   Output_tree = rootnp.root2array(extfile.path,'tree',Output_variables,None,0,nfiles_per_sample,args.testEvery,False,'weight')
   Output_tree = rootnp.rec2array(Output_tree)

   Output_tree_final = np.ndarray((Output_tree.shape[0],),
                                  dtype=[('flavour', float), ('vertexCategory', float),
                                         ('jetPt', float), ('jetEta', float), ('BDTG', float)])
   for idx,val in enumerate(BDTG):
    Output_tree_final[idx][0] = Output_tree[idx][0]
    Output_tree_final[idx][1] = Output_tree[idx][1]
    Output_tree_final[idx][2] = Output_tree[idx][2]
    Output_tree_final[idx][3] = Output_tree[idx][3]
    Output_tree_final[idx][4] = BDTG[idx]
    
   Output_tree_final = Output_tree_final.view(np.recarray)
   outname = 'trainPlusBDTG_CombinedSV%s_%s.root' % (category, flavor) if not args.batch else \
      '%strainPlusBDTG_CombinedSV%s_%s.root' % (args.trainingTag, category, flavor)
   outfile = os.path.join(dirpath, outname)
   rootnp.array2root(Output_tree_final, outfile, 'tree')  # array2root returns None
   with io.root_open(outfile, 'update') as tout:
      tout.WriteTObject(watermark, 'watermark')
      tout.WriteTObject(codemark, 'codemark')
   log.info('Output file dumped in %s' % outfile)
log.info('done')
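The list comprehension above maps scikit-learn probabilities from [0, 1] onto the TMVA-style [-1, 1] range; the same mapping can be done in one vectorized step, e.g.:

import numpy as np

BDTG = 2.0 * clf_val.predict_proba(X_val)[:, 1] - 1.0  # equivalent, vectorized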
Example #56
0
import numpy as np
import root_numpy as rn
from numpy.lib import recfunctions as nf

def analyse(bdt_models, bdt_taggers, dnn_models, dnn_taggers, dnn_scaler, data_files):
    # using bdt_model
    # bdt_model.predict_proba(data)

    # need to scale the data to use the dnn
    # for i, v in enumerate(scaler.variables):
    #     data[v] = (data[v] - scaler.means[i]) / scaler.std[i]
    # This can be done on an event-by-event basis too:
    # event.variable = (event.variable - means[i]) / std[i]
    
    # using dnn_model
    # dnn_model.predict(data)[0]

    # The variables have different names in the data ntuples, so map them onto
    # the names the models were trained with.
    bdt_var_map = {'Aplanarity': 'jetTrimX_aplanarity', 'EEC_C2_1': 'jetTrimX_c2beta1',
                   'EEC_D2_1': 'jetTrimX_d2beta1', 'Sphericity': 'jetTrimX_sphericity',
                   'SPLIT12': 'jetTrimX_groosplit12', 'TauWTA1': 'jetTrimX_grootau1',
                   'TauWTA2': 'jetTrimX_grootau2', 'TauWTA2TauWTA1': 'jetTrimX_grootau21',
                   'Mu12': 'jetTrimX_mufilt', 'yfilt': 'jetTrimX_ysfilt', 'y': 'jetTrimX_y',
                   'nTracks': 'jetTrimX_ungrngtrk', 'PlanarFlow': 'jetTrimX_planarflow'}
    # the variable lists must follow the tagger ordering
    bdt_vars_1 = []
    bdt_vars_2 = []
    for b in bdt_taggers:
        bdt_vars_1.append(bdt_var_map[b].replace('X','1'))
        bdt_vars_2.append(bdt_var_map[b].replace('X','2'))
    bdt_ivd = {}
    for k, v in bdt_var_map.items():
        bdt_ivd[v.replace('X','1')] = k
        bdt_ivd[v.replace('X','2')] = k

    
    dnn_var_map = {'aplanarity': 'jetTrimX_aplanarity', 'eec_c2_1': 'jetTrimX_c2beta1',
                   'eec_d2_1': 'jetTrimX_d2beta1', 'sphericity': 'jetTrimX_sphericity',
                   'split12': 'jetTrimX_groosplit12', 'tauwta1': 'jetTrimX_grootau1',
                   'tauwta2': 'jetTrimX_grootau2', 'tauwta2tauwta1': 'jetTrimX_grootau21',
                   'mu12': 'jetTrimX_mufilt', 'yfilt': 'jetTrimX_ysfilt', 'y': 'jetTrimX_y',
                   'ntracks': 'jetTrimX_ungrngtrk', 'planarflow': 'jetTrimX_planarflow'}
    # the variable lists must follow the tagger ordering
    dnn_vars_1 = []
    dnn_vars_2 = []
    print 'dnn_taggers'
    print dnn_taggers
    for b in dnn_taggers:
        dnn_vars_1.append(dnn_var_map[b].replace('X','1'))
        dnn_vars_2.append(dnn_var_map[b].replace('X','2'))

    dnn_ivd = {}
    for k, v in dnn_var_map.items():
        dnn_ivd[v.replace('X','1')] = k
        dnn_ivd[v.replace('X','2')] = k

        
    # read in the data so we can analyse it!
    # gotta do it one at a time for each data file :(
    for data_file in data_files:
        # benchmark and try a few options
        # first read in data using root2array
        print data_file
        data_arr = rn.root2rec(data_file)

        # unfortunately, some of the values in the ntuples are NaNs!!!
        # keep track of which ones these are...
        nan_idx_tmp = np.empty([0])
        no_values_tmp = np.empty([0])
        for d in data_arr.dtype.names:
            if d not in dnn_vars_1 and d not in dnn_vars_2:
                continue

            # flag both NaNs and infs, then replace them with finite values
            if np.any(~np.isfinite(data_arr[d])):
                bad_idx = np.asarray(np.where(~np.isfinite(data_arr[d])))[0]
                if len(nan_idx_tmp) == 0:
                    nan_idx_tmp = bad_idx
                else:
                    nan_idx_tmp = np.concatenate((nan_idx_tmp, bad_idx))
                data_arr[d] = np.nan_to_num(data_arr[d])
                
            if len(no_values_tmp) == 0:
                no_values_tmp = np.asarray(np.where(data_arr[d]<-98))[0]
            else:
                no_values_tmp = np.concatenate((no_values_tmp, np.asarray(np.where(data_arr[d]<-98))[0]))
        
        # deduplicate the collected indices
        nan_idx = np.unique(nan_idx_tmp)
        no_values = np.unique(no_values_tmp)

        # rescale split12 to the units the models were trained on (undone before writing out)
        data_arr['jetTrim1_groosplit12'] = data_arr['jetTrim1_groosplit12'] / 1000.
        data_arr['jetTrim2_groosplit12'] = data_arr['jetTrim2_groosplit12'] / 1000.
        # get the columns for classifying for jet1
        bdt_data = data_arr[bdt_vars_1]
        dnn_data = data_arr[dnn_vars_1]
        # get the columns for classifying for jet2
        bdt_data_2 = data_arr[bdt_vars_2]
        dnn_data_2 = data_arr[dnn_vars_2]

        # rename the data fields to the names the models expect
        recs = []

        bdt_data.dtype.names = [bdt_ivd[d] for d in bdt_data.dtype.names]
        bdt_data_2.dtype.names = [bdt_ivd[d] for d in bdt_data_2.dtype.names]

        for d in bdt_data.dtype.names:
            if d != 'nTracks' and d.find('trk') == -1:
                recs.append((d, 'float'))
            else:
                recs.append((d, 'int'))
            #print np.any(np.isnan(bdt_data[d]))
            #print np.all(np.isfinite(bdt_data[d]))
            #print np.where(np.isnan(bdt_data[d]))


        bdt_data_arr = bdt_data.view(np.float32).reshape(bdt_data.shape + (-1,))
        bdt_data_arr_2 = bdt_data_2.view(np.float32).reshape(bdt_data_2.shape + (-1,))

        bdt_proba = None
        bdt_proba_2 = None
        for m in bdt_models:
            if bdt_proba is not None:
                bdt_proba += m.predict_proba(bdt_data_arr)[:,1]
                bdt_proba_2 += m.predict_proba(bdt_data_arr_2)[:,1]
            else:
                bdt_proba = m.predict_proba(bdt_data_arr)[:,1]
                bdt_proba_2 = m.predict_proba(bdt_data_arr_2)[:,1]
        # average the scores over the BDT ensemble
        bdt_proba /= float(len(bdt_models))
        bdt_proba_2 /= float(len(bdt_models))
        
        
        for i, v in enumerate(dnn_scaler.variables):
            # reverse lookup for v
            #print v
            if v in dnn_var_map.keys():
                #print 'found v in dnn_var_map'
                #print dnn_var_map[v]
                if dnn_var_map[v].replace('X','1') in dnn_data.dtype.names:
                    '''
                    print v
                    print 'means and std'
                    print dnn_scaler.means[i]
                    print dnn_scaler.std[i]
                    print np.mean(dnn_data[dnn_var_map[v].replace('X','1')])
                    '''
                    dnn_data[dnn_var_map[v].replace('X','1')] = (dnn_data[dnn_var_map[v].replace('X','1')] - dnn_scaler.means[i]) / dnn_scaler.std[i]
                    dnn_data_2[dnn_var_map[v].replace('X','2')] = (dnn_data_2[dnn_var_map[v].replace('X','2')] - dnn_scaler.means[i]) / dnn_scaler.std[i]

        dnn_data.dtype.names = [dnn_ivd[d] for d in dnn_data.dtype.names]
        dnn_data_2.dtype.names = [dnn_ivd[d] for d in dnn_data_2.dtype.names]
            
        # average the predictions over the DNN ensemble
        dnn_predictions = None
        for m in dnn_models:
            if dnn_predictions is not None:
                dnn_predict1 = m.predict(dnn_data)[0]
                dnn_predict1.dtype.names = ['jetTrim1_dnn']
                dnn_predict2 = m.predict(dnn_data_2)[0]
                dnn_predict2.dtype.names = ['jetTrim2_dnn']
                for n in xrange(len(dnn_predict1['jetTrim1_dnn'])):
                    dnn_predictions['jetTrim1_dnn'][n] += dnn_predict1['jetTrim1_dnn'][n]
                    dnn_predictions_2['jetTrim2_dnn'][n] += dnn_predict2['jetTrim2_dnn'][n]
            else:
                dnn_predictions = m.predict(dnn_data)[0]
                dnn_predictions.dtype.names = ['jetTrim1_dnn']
                dnn_predictions_2 = m.predict(dnn_data_2)[0]
                dnn_predictions_2.dtype.names = ['jetTrim2_dnn']

        for n in xrange(len(dnn_predictions['jetTrim1_dnn'])):
            dnn_predictions['jetTrim1_dnn'][n] /= float(len(dnn_models))
            dnn_predictions_2['jetTrim2_dnn'][n] /= float(len(dnn_models))
        # zero the scores for events that had NaN or sentinel (< -98) values
        for n in np.unique(np.concatenate((no_values, nan_idx))):
            dnn_predictions['jetTrim1_dnn'][n] = 0
            dnn_predictions_2['jetTrim2_dnn'][n] = 0
            bdt_proba[n] = 0
            bdt_proba_2[n] = 0
        data_arr['jetTrim1_groosplit12'] *= 1000.
        data_arr['jetTrim2_groosplit12'] *= 1000.
        # note: arrays sliced from a structured array need a .copy() before
        # their dtype.names can be reassigned
        # now append the scores to the data and write everything out
        data_scored = nf.append_fields(data_arr, names=['jetTrim1_bdt', 'jetTrim2_bdt', 'jetTrim1_dnn', 'jetTrim2_dnn'], data=[bdt_proba, bdt_proba_2, dnn_predictions, dnn_predictions_2], usemask=False)
        rn.array2root(data_scored, data_file.replace('.root', '_scored_nonTrk_avg_vTest.root'), 'dibjet', 'recreate')
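The append_fields/array2root combination at the end of analyse is a generally useful pattern for attaching classifier scores to an existing ntuple. A self-contained sketch with placeholder data:

import numpy as np
from numpy.lib import recfunctions as rf
from root_numpy import array2root

events = np.zeros(4, dtype=[('pt', float), ('eta', float)])  # placeholder ntuple
scores = np.array([0.1, 0.9, 0.4, 0.7])  # placeholder classifier output
scored = rf.append_fields(events, names='bdt_score', data=scores, usemask=False)
array2root(scored, 'events_scored.root', 'tree', 'recreate')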
Example #57
0
def table_to_root(table, filename, **kwargs):
    """Write a Table to a ROOT file
    """
    import root_numpy
    root_numpy.array2root(table.as_array(), filename, **kwargs)
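A hedged usage sketch, assuming an astropy Table (its as_array() returns the structured array that array2root expects):

from astropy.table import Table

t = Table({'energy': [1.0, 2.0], 'counts': [10, 20]})
table_to_root(t, 'table.root', treename='events', mode='recreate')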