def NN_validate(filename, class_number=1, cut=0., original_n_events=20000):
    X = rootnp.root2array(args.ValidationDir + "/" + filename[0], "tree")
    X = rootnp.rec2array(X)
    for i in range(len(filename)):
        if i == 0:
            continue
        X_ = rootnp.root2array(args.ValidationDir + "/" + filename[i], "tree")
        X_ = rootnp.rec2array(X_)
        X = np.concatenate((X, X_))
    model = load_model(args.TrainingFile)
    scaler = pickle.load(open(args.ScalerFile, 'r'))
    X = scaler.transform(X)
    if class_number == -1:
        coupling_name = filename[0].split("_")[0]
        coupling_class = classes_dict[coupling_name]
        preds = model.predict(X)  # predict once, then slice per class
        discr_dict = {}
        for class_n in set(i for j, i in classes_dict.iteritems()):
            discr_dict[class_n] = preds[:, class_n]
        # discriminator: P(coupling) / (P(class 0) + P(coupling))
        discr = np.asarray([
            j / (discr_dict[0][jdx] + discr_dict[coupling_class][jdx])
            for jdx, j in enumerate(discr_dict[coupling_class])
        ])
    else:
        discr = model.predict(X)[:, class_number]
    nEvents = len(discr)
    print float(len(discr)), original_n_events, 100 * float(len(discr)) / float(original_n_events), "%"
    discr = discr[discr >= cut]
    print "selection efficiency NN cut: ", 100 * float(len(discr)) / float(nEvents)
    return float(len(discr)) / float(original_n_events)
def make_dataset(signals, backgrounds, category, region, fields, cuts=None):
    signal_arrs = []
    signal_weight_arrs = []
    background_arrs = []
    background_weight_arrs = []
    for signal in signals:
        rec = signal.merged_records(
            category=category, region=region, fields=fields, cuts=cuts)
        signal_weight_arrs.append(rec['weight'])
        signal_arrs.append(rec2array(rec, fields))
    for background in backgrounds:
        rec = background.merged_records(
            category=category, region=region, fields=fields, cuts=cuts)
        background_weight_arrs.append(rec['weight'])
        background_arrs.append(rec2array(rec, fields))
    signal_array = np.concatenate(signal_arrs)
    signal_weight_array = np.concatenate(signal_weight_arrs)
    background_array = np.concatenate(background_arrs)
    background_weight_array = np.concatenate(background_weight_arrs)
    return (signal_array, signal_weight_array,
            background_array, background_weight_array)
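# Usage sketch for make_dataset (hypothetical: `signals`, `backgrounds`,
# `category`, `region` and `fields` are assumed to come from the surrounding
# analysis framework, they are not defined here). The four returned arrays
# map directly onto a scikit-learn style training set.
sig, sig_w, bkg, bkg_w = make_dataset(signals, backgrounds,
                                      category, region, fields)
X = np.concatenate((sig, bkg))
y = np.concatenate((np.ones(len(sig)), np.zeros(len(bkg))))
w = np.concatenate((sig_w, bkg_w))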
def AddToROC(self, filename):
    """
    Info of the root file, the name of the probability branches
    and the target (0 or 1 or ...)
    """
    # Check whether this file is to be taken into account #
    valid_file = False
    for key, value in self.selector.items():
        if key in os.path.basename(filename):
            target = value
            valid_file = True
    if not valid_file:
        return False  # file not to be taken into account
    # Get the output prob #
    if self.weight_name and self.weight_name != '':
        probs = rec2array(root2array(filename, self.tree,
                                     branches=self.prob_branches + [self.weight_name],
                                     selection=self.cut))
        self.prob_per_class = np.concatenate((self.prob_per_class, probs[:, :-1]), axis=0)
        self.weight = np.concatenate((self.weight, probs[:, -1].reshape(-1, 1)), axis=0)
    else:
        probs = rec2array(root2array(filename, self.tree,
                                     branches=self.prob_branches,
                                     selection=self.cut))
        self.prob_per_class = np.concatenate((self.prob_per_class, probs), axis=0)
        self.weight = None
    # Binarize the targets #
    # e.g. target = 1 and classes = [0, 1, 2] => scores = [0, 1, 0]
    target_arr = self.lb.transform([target] * probs.shape[0])
    self.scores = np.concatenate((self.scores, target_arr), axis=0)
    return True
def test_rec2array():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False)],
        dtype=[('x', np.int32), ('y', np.float32),
               ('z', np.float64), ('w', np.bool)])
    arr = rnp.rec2array(a)
    assert_array_equal(arr,
        np.array([[12345, 2, 2.1, 1],
                  [3, 4, 4.2, 0]]))
    arr = rnp.rec2array(a, fields=['x', 'y'])
    assert_array_equal(arr,
        np.array([[12345, 2],
                  [3, 4]]))
    # single field
    arr = rnp.rec2array(a, fields=['x'])
    assert_equal(arr.ndim, 1)
    assert_equal(arr.shape, (a.shape[0],))
    # array fields
    a = np.array([
        ([1, 2, 3], [4.5, 6, 9.5]),
        ([4, 5, 6], [3.3, 7.5, 8.4])],
        dtype=[('x', np.int32, (3,)), ('y', np.float32, (3,))])
    arr = rnp.rec2array(a)
    assert_array_almost_equal(arr,
        np.array([[[1, 4.5], [2, 6], [3, 9.5]],
                  [[4, 3.3], [5, 7.5], [6, 8.4]]]))
def import_data():
    signal = root2array(BASE_PATH + "0nubb/sensitivity_0nubb_1E7_Pre_Cut.root",
                        "Sensitivity", BRANCH_NAMES_TRAIN)
    signal = rec2array(signal)

    bkg2nu = root2array(BASE_PATH + "2nubb/sensitivity_2nubb_2E8_Pre_Cut.root",
                        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkg2nu = rec2array(bkg2nu)

    bkg214Bi = root2array(
        BASE_PATH + "Bi214/sensitivity_Bi214_Foils_2E8_Pre_Cut.root",
        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkg214Bi = rec2array(bkg214Bi)

    bkg208Tl = root2array(
        BASE_PATH + "Tl208/sensitivity_Tl208_Foils_2E8_Pre_Cut.root",
        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkg208Tl = rec2array(bkg208Tl)

    bkgRn = root2array(
        BASE_PATH + "Radon/sensitivity_Bi214_Wires_2E8_Pre_Cut.root",
        "Sensitivity", BRANCH_NAMES_TRAIN)
    bkgRn = rec2array(bkgRn)

    return signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn
def tvars(rootfile, first, last):
    stringa = ["seed_pt", "seed_eta", "seed_phi", "seed_mass", "seed_dz", "seed_dxy",
               "seed_3D_ip", "seed_3D_sip", "seed_2D_ip", "seed_2D_sip",
               "seed_3D_signedIp", "seed_3D_signedSip", "seed_2D_signedIp", "seed_2D_signedSip",
               "seed_chi2reduced", "seed_nPixelHits", "seed_nHits",
               "seed_jetAxisDistance", "seed_jetAxisDlength"]
    stringa2 = ["nearTracks_pt", "nearTracks_eta", "nearTracks_phi", "nearTracks_dz",
                "nearTracks_dxy", "nearTracks_mass", "nearTracks_3D_ip", "nearTracks_3D_sip",
                "nearTracks_2D_ip", "nearTracks_2D_sip", "nearTracks_PCAdist", "nearTracks_PCAdsig",
                "nearTracks_PCAonSeed_x", "nearTracks_PCAonSeed_y", "nearTracks_PCAonSeed_z",
                "nearTracks_PCAonSeed_xerr", "nearTracks_PCAonSeed_yerr", "nearTracks_PCAonSeed_zerr",
                "nearTracks_PCAonTrack_x", "nearTracks_PCAonTrack_y", "nearTracks_PCAonTrack_z",
                "nearTracks_PCAonTrack_xerr", "nearTracks_PCAonTrack_yerr", "nearTracks_PCAonTrack_zerr",
                "nearTracks_dotprodTrack", "nearTracks_dotprodSeed", "nearTracks_dotprodTrackSeed2D",
                "nearTracks_dotprodTrackSeed3D", "nearTracks_dotprodTrackSeedVectors2D",
                "nearTracks_dotprodTrackSeedVectors3D", "nearTracks_PCAonSeed_pvd",
                "nearTracks_PCAonTrack_pvd", "nearTracks_PCAjetAxis_dist",
                "nearTracks_PCAjetMomenta_dotprod", "nearTracks_PCAjetDirs_DEta",
                "nearTracks_PCAjetDirs_DPhi"]
    f = TFile(rootfile)
    # tree = f.Get("analyzer1/tree")
    tree = root_numpy.tree2array(f.Get('analyzer1/tree'), branches=stringa2,
                                 selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                                 start=first, stop=last)
    print "loaded"
    tree2 = root_numpy.rec2array(tree)
    print tree2.shape
    print round(time.time() - starttime, 2), "reshape"
    tree3 = tree2.reshape((200, 36, len(tree)))
    tree3 = tree3.reshape((10, 720, len(tree)))
    print tree3.shape
    tree3 = tree3.swapaxes(0, 2)
    t2 = root_numpy.tree2array(f.Get('analyzer1/tree'), branches=stringa,
                               selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                               start=first, stop=last)
    t2 = root_numpy.rec2array(t2)
    print t2.shape
    t2 = t2.reshape((10, len(stringa), len(tree)))
    t2 = t2.swapaxes(0, 2)
    tree5 = numpy.concatenate((t2, tree3), axis=1)
    print tree5.shape
    outname = "tvars_" + str(first) + "_" + str(last) + "_" + rootfile.split(".")[0] + ".npy"
    numpy.save(outname, tree5)
    print time.time() - starttime
    f.Close()
    os.system("mv " + outname + " /gpfs/ddn/users/lgiannini/NN/DataMiniAODNewValidation")
def import_data_small_2():
    signal = root2array(BASE_PATH + "sensitivity_0nubb_1E5_Pred_With_Cut.root",
                        "Sensitivity", BRANCH_NAMES_TEST)
    signal = rec2array(signal)

    bkg2nu = root2array(
        BASE_PATH + "sensitivity_2nubb_1E5_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkg2nu = rec2array(bkg2nu)

    bkg214Bi = root2array(
        BASE_PATH + "sensitivity_Bi214_Foils_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkg214Bi = rec2array(bkg214Bi)

    bkg208Tl = root2array(
        BASE_PATH + "sensitivity_Tl208_Foils_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkg208Tl = rec2array(bkg208Tl)

    bkgRn = root2array(
        BASE_PATH + "sensitivity_Bi214_Wires_Small_Pred_With_Cut.root",
        "Sensitivity", BRANCH_NAMES_TEST)
    bkgRn = rec2array(bkgRn)

    return signal, bkg2nu, bkg214Bi, bkg208Tl, bkgRn
def load_data(filename, use_mc=False, cut_data=False):
    '''Load ROOT TTrees, return numpy arrays'''
    # Get the number of branches (+1 for radius, 2 for each array entry)
    tf = TFile(filename, "read")
    n_branches = len(tf.Get("tree").GetListOfBranches())
    n_inputs = (n_branches - 1) / 2

    # Open the files and transform branches to numpy arrays
    if use_mc is True:
        branches_in = ["hitPatternMC_{0}".format(i) for i in range(n_inputs)]
    else:
        branches_in = ["hitPatternFit_{0}".format(i) for i in range(n_inputs)]
    print "MAX:", max(branches_in), branches_in[-1]
    branches_out = ["radius"]

    # root2array converts ROOT tree entries into a numpy record array
    # (branches still in lists)
    ann_inputs = root2array(filename, "tree", branches_in)
    ann_output = root2array(filename, "tree", branches_out)

    # rec2array converts the list entries to an array for each record
    data_in = rec2array(ann_inputs)
    data_out = rec2array(ann_output)

    if cut_data is True:
        # Remove samples at z > 7m
        upper_neck = data_out < 7000
        print upper_neck
        data_out = data_out[upper_neck]
        data_in = data_in[upper_neck]

    # Normalise radius (even though this is a regression problem,
    # feature scaling still makes a big difference for the ANN)
    data_out = data_out / _radius_scale  # on the order of the AV radius
    return data_in, data_out
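# Self-contained illustration of the boolean-mask cut used in load_data
# above (pure numpy, no ROOT file needed; the values are made up): a single
# mask built from the targets filters inputs and targets consistently.
import numpy as np

data_out = np.array([6500., 7200., 6900.])
data_in = np.array([[1., 2.], [3., 4.], [5., 6.]])
upper_neck = data_out < 7000  # boolean mask, one entry per event
print(data_in[upper_neck])    # rows 0 and 2 survive
print(data_out[upper_neck])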
def train_sum(sample_signal, sample_bkg, tree, branch_names, selection):
    csv1 = ['SubJet_csv_1']
    csv2 = ['SubJet_csv_2']
    sig_csv1 = root2array(sample_signal, "outTree", csv1, selection=selection)
    sig_csv2 = root2array(sample_signal, "outTree", csv2, selection=selection)
    bkg_csv1 = root2array(sample_bkg, "outTree", csv1, selection=selection)
    bkg_csv2 = root2array(sample_bkg, "outTree", csv2, selection=selection)
    sig_csv1 = rec2array(sig_csv1)
    sig_csv2 = rec2array(sig_csv2)
    bkg_csv1 = rec2array(bkg_csv1)
    bkg_csv2 = rec2array(bkg_csv2)

    # build the summed-CSV feature column
    sig_sum_csv = np.array([[x[0] + y[0]] for x, y in zip(sig_csv1, sig_csv2)])
    bkg_sum_csv = np.array([[x[0] + y[0]] for x, y in zip(bkg_csv1, bkg_csv2)])

    print(branch_names)
    signal = get_numpy_array(sample_signal, tree, branch_names, selection)
    backgr = get_numpy_array(sample_bkg, tree, branch_names, selection)
    signal = np.append(signal, sig_sum_csv, axis=1)
    backgr = np.append(backgr, bkg_sum_csv, axis=1)
    print("signal sample and bkg numpy array done")

    X, y = merge_addColoumn(signal, backgr)
    print("signal, bkg merging done")

    sig_weight = get_weight_coloumn(sample_signal, tree, ['weight'], selection)
    bkg_weight = np.ones((backgr.shape[0], 1))
    sig_weight = np.concatenate(sig_weight, axis=0)
    bkg_weight = np.concatenate(bkg_weight, axis=0)
    weight = np.concatenate((sig_weight, bkg_weight), axis=0)
    print("weight np array done")

    print("splitting start")
    X_train, X_test, y_train, y_test, weight_train, weight_test = train_test_split(
        X, y, weight, test_size=0.33, random_state=42)
    print("splitting done")

    print("start training")
    dt = DecisionTreeClassifier(max_depth=5)
    bdt = AdaBoostClassifier(dt, algorithm='SAMME', n_estimators=800,
                             learning_rate=0.5)
    bdt.fit(X_train, y_train, sample_weight=weight_train)
    print("bdt has done the fitting")

    print("start testing")
    decisions = bdt.decision_function(X_test)
    print(decisions)
    fpr, tpr, thresholds = roc_curve(y_test, decisions)
    print("training done")
    return fpr, tpr
def run(name, source, quick=False):
    print time.asctime(time.localtime()), "Filling BDT Branches"

    branch_names = joblib.load("pickle/variables.pkl")

    if quick:
        signal = joblib.load('pickle/all_signalq.pkl')
        clf = joblib.load("pickle/" + name + "quick.pkl")
    else:
        signal = joblib.load('pickle/all_signal.pkl')
        clf = joblib.load("pickle/" + name + ".pkl")

    # predict and write probability of each MC event being signal
    bdt_MC_predicted = clf.predict_proba(signal)
    bdt_MC_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root(np.hsplit(bdt_MC_predicted, 2)[1],
               "/net/storage03/data/users/dlafferty/NTuples/SignalMC/2012/combined/Bs2phiphi_MC_2012_combined_corrected_TupleA_BDT.root",
               "DecayTree")

    # predict and write probability of every data event being signal
    all_data = root2array(
        "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
        "DecayTree", branch_names)
    all_data = rec2array(all_data)

    bdt_data_predicted = clf.predict_proba(all_data)
    bdt_data_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root(np.hsplit(bdt_data_predicted, 2)[1],
               "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
               "DecayTree")

    print time.asctime(time.localtime()), "Branches Filled!"
def array(self, **kwargs):
    """Return this sample's records as a plain 2D numpy array."""
    from root_numpy import rec2array
    rec = self.records(**kwargs)
    arr = rec2array(rec)
    return arr
def LoadObjectVars(self, Objects):
    print "Load ", Objects
    # Vars = root2array(filenames=self.File, treename=self.TreeName,
    #                   branches=self.VarNamesDict[Objects], start=0, stop=1000)
    Vars = root2array(filenames=self.File,
                      treename=self.TreeName,
                      branches=self.VarNamesDict[Objects])
    Vars = rec2array(Vars)
    print "Make ", Objects
    if Objects == "Event":
        for col, var in enumerate(Vars[0]):
            if isinstance(var, np.ndarray):
                Vars[:, col] = np.array(map(lambda x: x[0], Vars[:, col]))
        self.Vars[Objects] = Vars
    else:
        VarList = []
        for n_jet in range(0, self.nObjects):
            for col, var in enumerate(Vars[0]):
                VarList.append(
                    np.expand_dims(
                        np.array(map(
                            lambda x: x[n_jet] if x.shape[0] > n_jet else 0,
                            Vars[:, col])),
                        axis=1))
        self.Vars[Objects] = np.concatenate(VarList, axis=1)
    print "Shape ", Objects, ":\t", self.Vars[Objects].shape
def corrections(self, rec):
    # posterior trigger correction
    if not self.posterior_trigger_correction:
        return
    arr = rec2array(rec[['tau1_pt', 'tau2_pt']])
    weights = evaluate(self.trigger_correct, arr)
    return [weights]
def __getitem__(self, index):
    # gets the batch for the supplied index
    # return a tuple (numpy array of image, numpy array of labels) or None at epoch end
    logging.debug("-" * 80)
    logging.debug("New batch importation")
    X = np.zeros((self.batch_size, len(self.inputs)))
    Y = np.zeros((self.batch_size, len(self.outputs)))
    pointer = 0
    for f, size in self.batch_sample.items():
        size = int(size)  # for python2
        # while True:  # retry loop, kept for reference
        #     try:
        #         data = rec2array(root2array(f, treename='tree',
        #                                     branches=self.inputs + self.outputs,
        #                                     start=index * size, stop=(index + 1) * size))
        #         break
        #     except OSError:
        #         logging.warning("Could not import tree in worker, will try again in 3 seconds")
        #         time.sleep(3)
        data = rec2array(root2array(f, treename='tree',
                                    branches=self.inputs + self.outputs,
                                    start=index * size,
                                    stop=(index + 1) * size))
        # the first len(inputs) columns are the inputs, the rest the outputs
        X[pointer:pointer + size, :] = data[:, :len(self.inputs)]
        Y[pointer:pointer + size, :] = data[:, len(self.inputs):]
        pointer += size
        logging.debug("%s - Added %d entries from file %s"
                      % (self.state_set, size, os.path.basename(f)))
    if self.weights_generator == '':
        return X, Y
    else:
        W = self.weightsGen.getWeights(Y)
        return X, Y, W
def get_numpy_array(sample, tree, branch_names, selection):
    branch_names = [c.strip() for c in branch_names]
    branch_names = [b.replace(" ", "_") for b in branch_names]
    branch_names = [b.replace("-", "_") for b in branch_names]
    output_arr = root2array(sample, tree, branch_names, selection=selection)
    output_arr = rec2array(output_arr)
    return output_arr
def AddToROC(self, filename):
    # Check for correct target and records #
    valid_file = False
    for key, value in self.selector.items():
        if key in os.path.basename(filename):
            target = value
            valid_file = True
    if not valid_file:
        return False  # file not to be taken into account
    # Recover output #
    if self.weight_name and self.weight_name != '':
        out = root2array(filename, self.tree,
                         branches=[self.variable, self.weight_name],
                         selection=self.cut)
    else:
        out = root2array(filename, self.tree,
                         branches=self.variable,
                         selection=self.cut)
    try:
        out = rec2array(out)  # if not a vector, need to remove dtype
    except Exception:
        pass
    if out.ndim == 1:
        out = out.reshape(-1, 1)  # vector -> array
    if out.shape[1] > 1:  # contains [discriminant, weight]
        weight = out[:, 1]
        out = out[:, 0].reshape(-1, 1)
    # Add to container #
    tar = np.ones((out.shape[0], 1)) * target
    self.output = np.concatenate((self.output, out), axis=0)
    self.target = np.concatenate((self.target, tar), axis=0)
    if self.weight_name and self.weight_name != '':
        self.weight = weight
    return True
def get_weight_coloumn(signal_sample, tree, weight_branch, selection):
    weight_arr = root2array(signal_sample, tree, weight_branch,
                            selection=selection)
    weight_arr = rec2array(weight_arr)
    return weight_arr
def selectBranches_Candidate(file_name, tree_name, branch_names, selection_cuts, isGen):
    file = root2array(filenames=file_name, treename=tree_name,
                      branches=branch_names, selection=selection_cuts)
    file = rec2array(file)
    if isGen:
        px_index = branch_names.index("GenCandPx")
        py_index = branch_names.index("GenCandPy")
    else:
        px_index = branch_names.index("CandPx")
        py_index = branch_names.index("CandPy")
    # print("len_file = " + str(len(file)))
    for x in range(0, len(file)):
        # pad/truncate every branch to ncand entries and stack into a
        # (n_branches, ncand) block for this jet
        for y in range(0, len(branch_names)):
            file[x, y].resize(ncand, refcheck=False)
            temp = file[x, y].reshape(1, ncand)
            temp = temp.astype(float)
            if y == 0:
                info = temp
            else:
                info = np.concatenate((info, temp))
        bubbleSort(info, ncand, px_index, py_index)
        temp_jet = info.reshape(1, len(branch_names), ncand)
        if x == 0:
            info_candidates = temp_jet
        else:
            info_candidates = np.concatenate((info_candidates, temp_jet))
        if (x - 1) % 1000 == 0:
            print("info_candidates.shape = " + str(info_candidates.shape))
    # print("done")
    return info_candidates
def svars(rootfile, first, last):
    stringa = ["seed_pt", "seed_eta", "seed_phi", "seed_mass", "seed_dz", "seed_dxy",
               "seed_3D_ip", "seed_3D_sip", "seed_2D_ip", "seed_2D_sip",
               "seed_3D_signedIp", "seed_3D_signedSip", "seed_2D_signedIp", "seed_2D_signedSip",
               "seed_chi2reduced", "seed_nPixelHits", "seed_nHits",
               "seed_jetAxisDistance", "seed_jetAxisDlength"]
    f = TFile(rootfile)
    tree = f.Get("analyzer1/tree")
    t2 = root_numpy.tree2array(tree, branches=stringa,
                               selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                               start=first, stop=last)
    ll = len(t2)
    t2 = root_numpy.rec2array(t2)
    print t2.shape
    t2 = t2.reshape((10, len(stringa), ll))
    t2 = t2.swapaxes(0, 2)
    # t2 = numpy.reshape(t2, (len(t2), len(stringa), 10))
    # print t2.shape
    numpy.save("svars" + str(first) + "_" + str(last) + "_"
               + rootfile.split(".")[0] + ".npy", t2)
    print time.time() - starttime
    print t2.shape
    f.Close()
def concat_ttrees_to_array(ttrees, branches=None):
    """Concatenate multiple TTrees of different classes into one ndarray."""
    rec = []
    for i in range(len(ttrees)):
        rec.append(rnp.tree2rec(ttrees[i], branches))
    return rnp.rec2array(rnp.stack(rec, fields=branches), fields=branches)
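# Pure-numpy illustration of the stack-then-flatten step above (no ROOT
# trees needed; the two record arrays stand in for the per-tree records):
# rnp.stack concatenates record arrays on their common fields, and
# rec2array turns the result into a plain 2D array.
import numpy as np
import root_numpy as rnp

a = np.array([(1, 2.5)], dtype=[('x', np.int32), ('y', np.float64)])
b = np.array([(3, 4.5)], dtype=[('x', np.int32), ('y', np.float64)])
stacked = rnp.stack([a, b], fields=['x', 'y'])
print(rnp.rec2array(stacked, fields=['x', 'y']))  # [[1. 2.5], [3. 4.5]]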
def read_inputs(config, setup):
    from ttH.TauRoast.processing import Process

    fn = os.path.join(config.get("indir", config["outdir"]), "ntuple.root")

    signal = None
    signal_weights = None
    for proc, weight in sum([cfg.items() for cfg in setup['signals']], []):
        for p in Process.expand(proc):
            logging.debug('reading {}'.format(p))
            d = rec2array(root2array(fn, str(p), setup['variables']))
            if isinstance(weight, float) or isinstance(weight, int):
                w = np.array([weight] * len(d))
            else:
                w = rec2array(root2array(fn, str(p), [weight])).ravel()
            # scale each process by its cross section per generated event
            w *= p.cross_section / p.events
            if signal is not None:
                signal = np.concatenate((signal, d))
                signal_weights = np.concatenate((signal_weights, w))
            else:
                signal = d
                signal_weights = w

    background = None
    background_weights = None
    for proc, weight in sum([cfg.items() for cfg in setup['backgrounds']], []):
        for p in Process.expand(proc):
            logging.debug('reading {}'.format(p))
            d = rec2array(root2array(fn, str(p), setup['variables']))
            if isinstance(weight, float) or isinstance(weight, int):
                w = np.array([weight] * len(d))
            else:
                w = rec2array(root2array(fn, str(p), [weight])).ravel()
            w *= p.cross_section / p.events
            if background is not None:
                background = np.concatenate((background, d))
                background_weights = np.concatenate((background_weights, w))
            else:
                background = d
                background_weights = w

    factor = np.sum(signal_weights) / np.sum(background_weights)
    logging.info("renormalizing background events by factor {}".format(factor))
    background_weights *= factor

    return signal, signal_weights, background, background_weights
def get_inputs(sample_name, variables, filename=None, tree_name='mva',
               dir='', weight_name='event_weight', lumi=1.):
    x = None
    y = None
    w = None

    infiles = []
    xsections = []
    if filename is not None:
        infiles = [dir + filename]
    else:
        if 'ttH' in sample_name:
            infiles = [dir + "mvaVars_ttH_loose.root"]
            xsections = [0.215]
        elif 'ttV' in sample_name:
            infiles = [dir + "mvaVars_TTZ_loose.root",
                       dir + "mvaVars_TTW_loose.root"]
            xsections = [0.253, 0.204]  # [TTZ, TTW]
        elif 'ttbar' in sample_name:
            infiles = [dir + "mvaVars_TTSemilep_loose.root",
                       dir + "mvaVars_TTDilep_loose.root"]
            xsections = [182, 87.3]  # [semilep, dilep]
        else:
            print "Pick one sample name from 'ttH', 'ttV' or 'ttbar'"
            return x, y, w

    for fn, xs in zip(infiles, xsections):
        xi = rec2array(root2array(fn, tree_name, variables))
        wi = root2array(fn, tree_name, weight_name)
        # scale samples based on lumi and cross section
        wi *= lumi * xs / np.sum(wi)
        if x is not None:
            x = np.concatenate((x, xi))
            w = np.concatenate((w, wi))
        else:
            x = xi
            w = wi

    if 'ttH' in sample_name:
        y = np.ones(x.shape[0])
    else:
        y = np.zeros(x.shape[0])

    return x, y, w
def readout_to_numpy_arrays(infilename, treename, outpath, outname,
                            unwanted_tags, unwanted_exact_tags):
    infile = ROOT.TFile.Open(infilename)
    myoutpath = outpath
    create_path(myoutpath)
    print 'creating numpy arrays for input sample %s' % (outname)

    # Get AnalysisTree
    entries = infile.AnalysisTree.GetEntriesFast()
    tree = infile.Get(treename)
    leaves = tree.GetListOfLeaves()

    # Collect the branch names to read, skipping unwanted ones
    variables = []
    eventweights = ['eventweight']
    for leaf in leaves:
        write = True
        for tag in unwanted_tags:
            if tag in leaf.GetName():
                write = False
        for tag in unwanted_exact_tags:
            if tag == leaf.GetName():
                write = False
        if write:
            variables.append(leaf.GetName())
    print variables
    print "len(variables): ", len(variables)

    # Read the tree in chunks and dump each chunk to .npy
    chunksize = 200000
    maxidx = int(entries / float(chunksize)) + 1
    if entries % chunksize == 0:
        maxidx -= 1
    print entries, chunksize, maxidx

    for i in range(maxidx):
        mymatrix = root2array(filenames=infilename, treename=treename,
                              branches=variables,
                              start=i * chunksize, stop=(i + 1) * chunksize)
        mymatrix = rec2array(mymatrix)
        myweights = root2array(filenames=infilename, treename=treename,
                               branches=eventweights,
                               start=i * chunksize, stop=(i + 1) * chunksize)
        myweights = rec2array(myweights)
        thisoutname = myoutpath + outname + '_' + str(i) + '.npy'
        thisoutname_weights = myoutpath + 'Weights_' + outname + '_' + str(i) + '.npy'
        np.save(thisoutname, mymatrix)
        np.save(thisoutname_weights, myweights)
        percent = float(i + 1) / float(maxidx) * 100.
        sys.stdout.write('{0:d} of {1:d} ({2:4.2f} %) jobs done.\r'.format(i + 1, maxidx, percent))
        if not i == maxidx - 1:
            sys.stdout.flush()

    with open(myoutpath + 'variable_names.pkl', 'w') as f:
        pickle.dump(variables, f)
def arrays(self, category, region,
           cuts=None,
           fields=None,
           clf=None,
           clf_name='classifier',
           include_weight=True,
           systematic='NOMINAL'):
    bkg_recs, sig_recs = self.records(
        category, region,
        cuts=cuts,
        fields=fields,
        clf=clf,
        clf_name=clf_name,
        include_weight=include_weight,
        systematic=systematic)
    bkg_arrs = {}
    sig_arrs = {}
    for b, rec in bkg_recs.items():
        bkg_arrs[b] = rec2array(rec)
    for s, rec in sig_recs.items():
        sig_arrs[s] = rec2array(rec)
    return bkg_arrs, sig_arrs
def add_classifier(data, c_name, classifier, features):
    """Add a classifier column to the record array."""
    # compute discriminator and append to array
    data_disc = classifier.predict_proba(rec2array(data[features]))[:, 1]
    data = append_fields(data, [c_name], [data_disc],
                         asrecarray=True, usemask=False)
    return data
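# Toy usage sketch for add_classifier (self-contained under root_numpy-era
# numpy/sklearn; the record array, field names and tiny classifier below are
# illustrative stand-ins for the analysis ntuple and trained BDT):
import numpy as np
from numpy.lib.recfunctions import append_fields
from root_numpy import rec2array
from sklearn.ensemble import GradientBoostingClassifier

data = np.array([(1.0, 2.0), (3.0, 4.0), (5.0, 6.0), (7.0, 8.0)],
                dtype=[('pt', np.float64), ('eta', np.float64)])
clf = GradientBoostingClassifier(n_estimators=10).fit(
    rec2array(data[['pt', 'eta']]), [0, 0, 1, 1])
data = add_classifier(data, 'bdt', clf, ['pt', 'eta'])
print(data['bdt'])  # one discriminator value per record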
def Convert(self):
    if self.weights:
        self.variables.append(self.weights[0])
    print self.variables

    train_Signal = root2array(self.SPath, self.Streename, self.variables)
    train_Background = root2array(self.BPath, self.Btreename, self.variables)
    train_Signal = rec2array(train_Signal)
    self.train_Signal = train_Signal
    print '#Signalevents = ', len(train_Signal)
    train_Background = rec2array(train_Background)
    self.train_Background = train_Background
    print '#Backgroundevents = ', len(train_Background)

    X_train = np.concatenate((train_Signal, train_Background))
    y_train = np.concatenate((np.ones(train_Signal.shape[0]),
                              np.zeros(train_Background.shape[0])))

    if self.StestPath == '':
        X_train, X_test, y_train, y_test = train_test_split(
            X_train, y_train, test_size=0.33, random_state=42)
    else:
        test_Signal = root2array(self.StestPath, self.Streename, self.variables)
        test_Background = root2array(self.BtestPath, self.Btreename, self.variables)
        test_Signal = rec2array(test_Signal)
        test_Background = rec2array(test_Background)
        self.test_Signal = test_Signal
        self.test_Background = test_Background
        X_test = np.concatenate((test_Signal, test_Background))
        y_test = np.concatenate((np.ones(test_Signal.shape[0]),
                                 np.zeros(test_Background.shape[0])))

    # split off the weight column (it was appended as the last variable)
    for i in X_train:
        self.train_weights.append(i[-1])
    X_train = np.delete(X_train, np.s_[-1], 1)
    for i in X_test:
        self.test_weights.append(i[-1])
    X_test = np.delete(X_test, np.s_[-1], 1)
    del self.variables[-1]

    self.Var_Array = X_train
    self.ID_Array = y_train
    self.test_var = X_test
    self.test_ID = y_test

    #########################################
    # --- store stuff to compare afterwards ---
    IDfile = open("ID.pkl", "w")
    pickle.dump(self.test_ID, IDfile)
    IDfile.close()
def selectBranches_Event(file_name, tree_name, branch_names, selection_cuts):
    file = root2array(filenames=file_name, treename=tree_name,
                      branches=branch_names, selection=selection_cuts)
    # it needs 2 steps for a proper conversion into numpy.ndarray
    file = rec2array(file)
    file = file.astype(variable_type)
    # returns a numpy.ndarray whose shape is (n_events, n_branches)
    return file
def jvars(rootfile, first, last):
    jvars = ["jet_pt", "jet_eta", "jet_phi", "jet_mass", "jet_flavour"]
    f = TFile(rootfile)
    tree = f.Get("analyzer1/tree")
    t2 = root_numpy.tree2array(tree, branches=jvars,
                               selection="(jet_pt>30)&&(abs(jet_eta)<2.4)",
                               start=first, stop=last)
    t2 = root_numpy.rec2array(t2)
    numpy.save("jvars_" + str(first) + "_" + str(last) + "_"
               + rootfile.split(".")[0] + ".npy", t2)
    print t2.shape
    print time.time() - starttime
    f.Close()
def Run(self, batchsize=int(1e5)):
    r"""Fill all registered histograms.

    The histograms are filled using the :func:`root_numpy.root2array`
    method.

    :param batchsize: number of events to process at once (default: 100000)
    :type batchsize: ``int``
    """
    branchexprs = set()
    for histo, options in self._store:
        branchexprs.update(options["varexp"].split(":"))
        branchexprs.add("({})*({})".format(options["weight"], options["cuts"]))
        if not options["append"]:
            histo.Reset()
    for start in range(0, self._entries, batchsize):
        array = rnp.root2array(
            self._filepath,
            self._treename,
            branches=branchexprs,
            start=start,
            stop=start + batchsize,
        )
        for histo, options in self._store:
            if ":" not in options["varexp"]:
                varexp = array[options["varexp"]]
            else:
                varexp = rnp.rec2array(array[options["varexp"].split(":")])
            cuts = array["({})*({})".format(options["weight"], options["cuts"])]
            # entries failing the cut carry weight zero and are skipped
            mask = np.where(cuts != 0)
            rnp.fill_hist(histo, varexp[mask], weights=cuts[mask])
    zeroentriesoptions = []
    for histo, options in self._store:
        options = {
            k: v for k, v in options.items()
            if k not in ["varexp", "append"]
        }
        if histo.GetEntries() == 0 and options not in zeroentriesoptions:
            logger.warning(
                "No events have been extracted for tree '{}' in file '{}' "
                "using cuts='{}' and weight='{}'!".format(
                    self._treename,
                    self._filepath,
                    options["cuts"],
                    options["weight"],
                ))
            zeroentriesoptions.append(options)
    logger.info(
        "Filled {} histograms using tree '{}' in file '{}'.".format(
            len(self._store), self._treename, self._filepath))
def __getitem__(self, index):
    # gets the batch for the supplied index
    # return a tuple (numpy array of image, numpy array of labels) or None at epoch end
    logging.debug("-" * 80)
    logging.debug("New batch importation")
    X = np.zeros((self.batch_size, len(self.inputs)))
    Y = np.zeros((self.batch_size, len(self.outputs)))
    pointer = 0
    for f, size in self.batch_sample.items():
        size = int(size)  # for python2
        X[pointer:pointer + size, :] = rec2array(
            root2array(f, treename='tree', branches=self.inputs,
                       start=index * size, stop=(index + 1) * size))
        Y[pointer:pointer + size, :] = rec2array(
            root2array(f, treename='tree', branches=self.outputs,
                       start=index * size, stop=(index + 1) * size))
        pointer += size
        logging.debug("%s - Added %d entries from file %s"
                      % (self.state_set, size, os.path.basename(f)))
    if self.weights_generator == '':
        return X, Y
    else:
        W = self.weightsGen.getWeights(Y)
        return X, Y, W
def classify(self, sample, category, region, cuts=None, systematic='NOMINAL'):
    if self.clfs is None:
        raise RuntimeError("you must train the classifiers first")

    partitions = sample.partitioned_records(
        category=category,
        region=region,
        fields=self.fields,
        cuts=cuts,
        systematic=systematic,
        num_partitions=2,
        return_idx=True,
        key=self.partition_key)

    score_idx = [[], []]
    for i, partition in enumerate(partitions):
        for rec, idx in partition:
            weight = rec['weight']
            arr = rec2array(rec, self.fields)
            # each classifier is never used on the partition that trained it
            scores = self.clfs[i].decision_function(arr)
            score_idx[i].append((idx, scores, weight))

    # must preserve order of scores wrt the other fields!
    # merge the scores and weights according to the idx
    merged_scores = []
    merged_weight = []
    for left, right in zip(*score_idx):
        left_idx, left_scores, left_weight = left
        right_idx, right_scores, right_weight = right
        insert_idx = np.searchsorted(left_idx, right_idx)
        scores = np.insert(left_scores, insert_idx, right_scores)
        weight = np.insert(left_weight, insert_idx, right_weight)
        merged_scores.append(scores)
        merged_weight.append(weight)

    scores = np.concatenate(merged_scores)
    weight = np.concatenate(merged_weight)

    if self.transform:
        log.info("classifier scores are transformed")
        if isinstance(self.transform, types.FunctionType):
            # user-defined transformation
            scores = self.transform(scores)
        else:
            # logistic transformation used by TMVA (MethodBDT.cxx)
            scores = -1 + 2.0 / (1.0 + np.exp(
                -self.clfs[0].n_estimators *
                self.clfs[0].learning_rate *
                scores / 1.5))

    return scores, weight
def make_partitioned_dataset(signals, backgrounds,
                             category, region, fields,
                             partition_key, cuts=None):
    signal_arrs = []
    signal_weight_arrs = []
    background_arrs = []
    background_weight_arrs = []
    for signal in signals:
        left, right = signal.partitioned_records(
            category=category, region=region,
            fields=fields, cuts=cuts,
            key=partition_key)
        signal_weight_arrs.append((left['weight'], right['weight']))
        signal_arrs.append((rec2array(left, fields),
                            rec2array(right, fields)))
    for background in backgrounds:
        left, right = background.partitioned_records(
            category=category, region=region,
            fields=fields, cuts=cuts,
            key=partition_key)
        background_weight_arrs.append((left['weight'], right['weight']))
        background_arrs.append((rec2array(left, fields),
                                rec2array(right, fields)))
    return (signal_arrs, signal_weight_arrs,
            background_arrs, background_weight_arrs)
def importROOTdata(branch_names, fName, treeName="DecayTree"):
    """Import a ROOT file into a numpy array for the selected branches.

    :param branch_names: names of the branches to be imported
    :type branch_names: tuple
    :param fName: name of the root file to be imported
    :type fName: str
    :param treeName: tree name in the file
    :type treeName: str
    :rtype: ndarray
    """
    from root_numpy import root2array, rec2array
    data_array = root2array(fName, treeName, branch_names)
    data_array = rec2array(data_array)
    return data_array
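# Usage sketch for importROOTdata (the file name and branch names here are
# placeholders, not from the original analysis):
branches = ("B_MM", "B_PT")
data = importROOTdata(branches, "candidates.root", treeName="DecayTree")
print(data.shape)  # (n_events, len(branches))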
def array(self, category=None, region=None,
          fields=None, cuts=None,
          clf=None, clf_name='classifier',
          include_weight=True,
          systematic='NOMINAL'):
    return rec2array(self.merged_records(
        category=category, region=region,
        fields=fields, cuts=cuts,
        clf=clf, clf_name=clf_name,
        include_weight=include_weight,
        systematic=systematic))
def evaluate(config, tree, names, transform=None):
    output = []
    dtype = []
    for name in names:
        setup = load(config, name.split("_")[1])
        data = rec2array(tree2array(
            tree.raw(),
            list(transform(setup["variables"])) if transform else setup["variables"]))
        if name.startswith("sklearn"):
            fn = os.path.join(config["mvadir"], name + ".pkl")
            with open(fn, 'rb') as fd:
                bdt, label = pickle.load(fd)
            scores = []
            if len(data) > 0:
                scores = bdt.predict_proba(data)[:, 1]
            output += [scores]
            dtype += [(name, 'float64')]
        fn = os.path.join(config["mvadir"], name + ".xml")
        reader = r.TMVA.Reader("Silent")
        for var in setup['variables']:
            reader.AddVariable(var, array('f', [0.]))
        reader.BookMVA("BDT", fn)
        scores = evaluate_reader(reader, "BDT", data)
        output += [scores]
        dtype += [(name.replace("sklearn", "tmvalike"), 'float64')]

    f = r.TFile(os.path.join(
        config.get("mvadir", config.get("indir", config["outdir"])),
        "mapping.root"), "READ")
    if f.IsOpen():
        likelihood = f.Get("hTargetBinning")

        def lh(values):
            return likelihood.GetBinContent(likelihood.FindBin(*values))

        indices = dict((v, n) for n, (v, _) in enumerate(dtype))
        tt = output[indices['tmvalike_tt']]
        ttZ = output[indices['tmvalike_ttZ']]
        if len(tt) == 0:
            output += [[]]
        else:
            output += [np.apply_along_axis(lh, 1, np.array([tt, ttZ]).T)]
        dtype += [('tmvalike_likelihood', 'float64')]
        f.Close()

    data = np.array(zip(*output), dtype)
    tree.mva(array2tree(data))
def DrawCorrelationMatrixFromROOT(infile, intree, outfile, brancharray,
                                  selection="", pickEvery=None):
    # container to hold the combined trees in numpy array structure
    X = np.ndarray((0, len(brancharray)), float)
    treeArray = rootnp.root2array(infile, intree, brancharray, selection,
                                  0, None, pickEvery, False, 'weight')
    X = rootnp.rec2array(treeArray)

    df = pd.DataFrame(X, columns=brancharray)
    corrmat = df.corr(method='pearson', min_periods=1)  # or 'spearman'

    fig, ax1 = plt.subplots(ncols=1, figsize=(12, 10))
    opts = {'cmap': plt.get_cmap("RdBu"),
            'vmin': corrmat.min().min(),
            'vmax': corrmat.max().max()}
    heatmap1 = ax1.pcolor(corrmat, **opts)
    plt.colorbar(heatmap1, ax=ax1)
    ax1.set_title("Correlation Matrix {%s}" % selection)
    labels = corrmat.columns.values
    for ax in (ax1,):
        # shift location of ticks to center of the bins
        ax.set_xticks(np.arange(len(labels)) + 0.5, minor=False)
        ax.set_yticks(np.arange(len(labels)) + 0.5, minor=False)
        ax.set_xticklabels(labels, minor=False, ha='right', rotation=70)
        ax.set_yticklabels(labels, minor=False)
    fig.tight_layout()

    log.info("Dumping output in %s" % outfile)
    fig.savefig(outfile)
from root_numpy import root2array, rec2array, array2root

bdt_file = '/lustre/cmswork/hh/mvas/xgboost/train_3CSVM_0.5sig_0.7bkg_weighted.pkl'
branch_names = ["H1_pT", "H2_pT",
                "H1_dEta_abs", "H2_dEta_abs",
                "H1_dPhi_abs", "H2_dPhi_abs"]

# compute bdt values
bdt = joblib.load(bdt_file)

for root_file in args.root_files:
    print "processing {}".format(root_file)
    # load vars data from ROOT
    data = root2array(root_file, args.tree_name, branch_names)
    data_bdt = bdt.predict_proba(rec2array(data[branch_names]))[:, 1]
    # save to ROOT file (cast before relabelling the dtype, so the float64
    # scores are not silently reinterpreted as pairs of float32)
    data_bdt = data_bdt.astype(np.float32)
    data_bdt.dtype = [(args.bdt_name, np.float32)]
    array2root(data_bdt, root_file, "tree")
base = os.path.basename(fname)
match = fname_regex.match(base)
if not match:
    raise ValueError("Could not match the regex to the file %s" % fname)
flavor = match.group('flavor')
full_category = match.group('category')
category = [i for i in sv_categories if i in full_category][0]
if flavor != args.flavour:
    continue
log.info('processing file %s' % fname)
extfile = fileserver.serve(fname)
pool_files.append(extfile)
nfiles_per_sample = None
tree = rootnp.root2array(extfile.path, 'tree', variables, None, 0,
                         nfiles_per_sample, args.pickEvery, False, 'weight')
tree = rootnp.rec2array(tree)
X = np.concatenate((X, tree), 0)
y = np.concatenate((y, np.ones(tree.shape[0])))

# This is needed for pandas DataFrame structure
log.info('Converting data to pandas DataFrame structure')
# Create a pandas DataFrame for our data; this provides many convenience
# functions for exploring your dataset.
# See http://betatim.github.io/posts/sklearn-for-TMVA-users/ for more info.
# Need to reshape y so it is a 2D array with one column.
df = pd.DataFrame(np.hstack((X, y.reshape(y.shape[0], -1))),
                  columns=variables + ['y'])
corrmat = df.drop('y', 1).corr(method='pearson', min_periods=1)
fig, ax1 = plt.subplots(ncols=1, figsize=(12, 10))
def ReadData(path_to_file, sname, selection=""):
    # Make data object
    dataobj = Data()

    # Get the data
    indata = root2array(filenames=path_to_file,
                        treename="tree",
                        branches=dataobj.t_varnames + dataobj.w_varnames,
                        selection=selection)

    # Add an extra field for the weights
    emptydata = []
    for i in range(len(m_weightnames)):
        emptydata.append(np.zeros(len(indata), dtype=float))
    indata = append_fields(base=indata,
                           names=m_weightnames,
                           data=emptydata,
                           usemask=False,
                           dtypes=float)

    # Loop and calculate the weights
    weight_tool = WeightTool()
    for i in range(len(indata)):
        if sname == m_sname_corsika or sname == m_sname_corsikaLE:
            indata[i][m_weightnames[0]] = weight_tool.getWeight(indata[i], sname)
            indata[i][m_weightnames[1]] = 0
            indata[i][m_weightnames[2]] = 0
        elif sname == m_sname_data:
            indata[i][m_weightnames[0]] = 1
            indata[i][m_weightnames[1]] = 0
            indata[i][m_weightnames[2]] = 0
        else:
            indata[i][m_weightnames[0]] = weight_tool.getWeight(indata[i], m_sname_E2)
            indata[i][m_weightnames[1]] = weight_tool.getWeight(indata[i], m_sname_Conv)
            indata[i][m_weightnames[2]] = weight_tool.getWeight(indata[i], m_sname_Prompt)

    # Convert to plain array
    # indata = rec2array(indata, fields=dataobj.t_varnames + ['w'])
    indata = rec2array(indata)

    # Remove nan if it exists
    indata = indata[~np.isnan(indata).any(axis=1)]

    # Get entries
    nEntries = len(indata)

    # Set the targets: 1 -- signal, 0 -- background
    if sname == m_sname_E2:
        targets = np.ones(nEntries, dtype=int)
    else:
        targets = np.zeros(nEntries, dtype=int)

    # Set properties of data object
    dataobj.setData(indata)
    dataobj.setTargets(targets)
    dataobj.setName(sname)

    return dataobj
j_n = "pfjets[{}].{}" branch_names = [j_n.format(i, v) for i,v in it.product(range(4), j_v)] mix_data = np.genfromtxt(asc_file, names=branch_names) mix_data.dtype.names = branch_names # fix names (symbols were erased) mix_data = add_cartesian(mix_data) mix_data = add_dijet_vars(mix_data) bdt = joblib.load(bdt_file) features = ["dijet[0].Pt()","dijet[1].Pt()", "dijet[0].DEta()","dijet[1].DEta()", "dijet[0].DPhi()","dijet[1].DPhi()"] mix_data_bdt = bdt.decision_function(rec2array(mix_data[features])) bdt_name = "bdt_value" mix_data = append_fields(mix_data, [bdt_name], [mix_data_bdt] , asrecarray=True, usemask=False) to_write = branch_names + [bdt_name] np.savetxt(out_file, rec2array(mix_data[to_write]))
lep1tauSS_cosDeltaPhi
lep1_mt
lep1_mva
jet_deltaRavg
""".split()

with open(save, 'rb') as fd:
    bdt = pickle.load(fd)

sig = r.TH1F('sig', '', 40, 0, 1)
bkg = r.TH1F('bkg', '', 40, 0, 1)

infile = r.TFile(ntuple)
for s in signals:
    data = rec2array(root2array(ntuple, str(s), variables))
    for v in bdt.predict_proba(data)[:, 1]:
        sig.Fill(v)
for b in backgrounds:
    data = rec2array(root2array(ntuple, str(b), variables))
    for v in bdt.predict_proba(data)[:, 1]:
        bkg.Fill(v)

c = r.TCanvas()
sig.SetLineColor(r.kBlue)
sig.Scale(1. / sig.Integral())
sig.Draw()
bkg.SetLineColor(r.kRed)
bkg.Scale(1. / bkg.Integral())
bkg.Draw("same")
c.SaveAs('output.png')
# REVALIDATION OF ALL CLASSIFIERS --> dump Discriminators #
#******************************************************
dict_Discriminators = {}

#******************************************************
# All types, all classifiers
#******************************************************
log.info('Processing: %sall types, all classifiers (including the best for each type)%s'
         % (Fore.BLUE, Fore.WHITE))
for t in Types:
    variables = pickle.load(open(args.Typesdir + t + "/featurenames.pkl", "r"))
    variables = [x for x in variables if x != 'flavour']
    X = rootnp.root2array(args.InputFile, args.InputTree, variables, None, 0,
                          args.elements_per_sample, args.pickEvery, False, 'weight')
    X = rootnp.rec2array(X)
    for c in clf_names:
        log.info('Type: %s%s%s, Classifier: %s%s%s'
                 % (Fore.RED, t, Fore.WHITE, Fore.GREEN, c, Fore.WHITE))
        classifier = dict_clf[t + '_' + c]
        dict_Discriminators[t + '_' + c] = classifier.predict_proba(X)[:, 1]
    best_clf_name = dict_pickles["Best"][t][0]
    best_classifier = dict_clf[t + '_BEST_' + best_clf_name]
    log.info('Type: %s%s%s, Best Classifier is: %s%s%s'
             % (Fore.RED, t, Fore.WHITE, Fore.GREEN, best_clf_name, Fore.WHITE))
    dict_Discriminators[t + '_BEST_' + best_clf_name] = best_classifier.predict_proba(X)[:, 1]

#******************************************************
# CombinedMVA
#******************************************************
def test_rec2array():
    # scalar fields
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False)],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool)])
    arr = rnp.rec2array(a)
    assert_array_equal(arr,
        np.array([
            [12345, 2, 2.1, 1],
            [3, 4, 4.2, 0]]))
    arr = rnp.rec2array(a, fields=['x', 'y'])
    assert_array_equal(arr,
        np.array([
            [12345, 2],
            [3, 4]]))
    # single scalar field
    arr = rnp.rec2array(a, fields=['x'])
    assert_array_equal(arr, np.array([[12345], [3]], dtype=np.int32))
    # single scalar field simplified
    arr = rnp.rec2array(a, fields='x')
    assert_array_equal(arr, np.array([12345, 3], dtype=np.int32))
    # case where array has single record
    assert_equal(rnp.rec2array(a[:1]).shape, (1, 4))
    assert_equal(rnp.rec2array(a[:1], fields=['x']).shape, (1, 1))
    assert_equal(rnp.rec2array(a[:1], fields='x').shape, (1,))

    # array fields
    a = np.array([
        ([1, 2, 3], [4.5, 6, 9.5]),
        ([4, 5, 6], [3.3, 7.5, 8.4])],
        dtype=[
            ('x', np.int32, (3,)),
            ('y', np.float32, (3,))])
    arr = rnp.rec2array(a)
    assert_array_almost_equal(arr,
        np.array([[[1, 4.5],
                   [2, 6],
                   [3, 9.5]],
                  [[4, 3.3],
                   [5, 7.5],
                   [6, 8.4]]]))
    # single array field
    arr = rnp.rec2array(a, fields=['y'])
    assert_array_almost_equal(arr,
        np.array([[[4.5], [6], [9.5]],
                  [[3.3], [7.5], [8.4]]]))
    # single array field simplified
    arr = rnp.rec2array(a, fields='y')
    assert_array_almost_equal(arr,
        np.array([[4.5, 6, 9.5],
                  [3.3, 7.5, 8.4]]))
    # case where array has single record
    assert_equal(rnp.rec2array(a[:1], fields=['y']).shape, (1, 3, 1))
    assert_equal(rnp.rec2array(a[:1], fields='y').shape, (1, 3))

    # lengths mismatch
    a = np.array([
        ([1, 2], [4.5, 6, 9.5]),
        ([4, 5], [3.3, 7.5, 8.4])],
        dtype=[
            ('x', np.int32, (2,)),
            ('y', np.float32, (3,))])
    assert_raises(ValueError, rnp.rec2array, a)

    # mix of scalar and array fields should fail
    a = np.array([
        (1, [4.5, 6, 9.5]),
        (4, [3.3, 7.5, 8.4])],
        dtype=[
            ('x', np.int32),
            ('y', np.float32, (3,))])
    assert_raises(ValueError, rnp.rec2array, a)
def train(self,
          signals,
          backgrounds,
          cuts=None,
          max_sig=None,
          max_bkg=None,
          norm_sig_to_bkg=True,
          same_size_sig_bkg=True,  # NOTE: if True this crops signal a lot!!
          remove_negative_weights=False,
          grid_search=True,
          cv_nfold=5,
          use_cache=True,
          **clf_params):
    """
    Determine best BDTs on left and right partitions.
    Each BDT will then be used on the other partition.
    """
    if use_cache and not self.clfs:
        if self.load():
            return

    signal_recs = []
    signal_arrs = []
    signal_weight_arrs = []
    for signal in signals:
        left, right = signal.partitioned_records(
            category=self.category,
            region=self.region,
            fields=self.all_fields,
            cuts=cuts,
            key=self.partition_key)
        signal_weight_arrs.append((left['weight'], right['weight']))
        signal_arrs.append((rec2array(left, self.fields),
                            rec2array(right, self.fields)))
        signal_recs.append((left, right))

    background_recs = []
    background_arrs = []
    background_weight_arrs = []
    for background in backgrounds:
        left, right = background.partitioned_records(
            category=self.category,
            region=self.region,
            fields=self.all_fields,
            cuts=cuts,
            key=self.partition_key)
        background_weight_arrs.append((left['weight'], right['weight']))
        background_arrs.append((rec2array(left, self.fields),
                                rec2array(right, self.fields)))
        background_recs.append((left, right))

    self.clfs = [None, None]
    for partition_idx in range(2):
        clf_filename = os.path.join(
            CACHE_DIR, 'classify',
            'clf_%s%s_%d' % (self.category.name,
                             self.clf_output_suffix,
                             partition_idx))

        # train a classifier:
        # merge arrays and create training samples
        signal_train = np.concatenate(
            map(itemgetter(partition_idx), signal_arrs))
        signal_weight_train = np.concatenate(
            map(itemgetter(partition_idx), signal_weight_arrs))
        background_train = np.concatenate(
            map(itemgetter(partition_idx), background_arrs))
        background_weight_train = np.concatenate(
            map(itemgetter(partition_idx), background_weight_arrs))

        if remove_negative_weights:
            # remove samples from the training sample with a negative weight
            signal_train = signal_train[signal_weight_train >= 0]
            background_train = background_train[background_weight_train >= 0]
            signal_weight_train = signal_weight_train[signal_weight_train >= 0]
            background_weight_train = background_weight_train[background_weight_train >= 0]

        if max_sig is not None and max_sig < len(signal_train):
            subsample = np.random.permutation(len(signal_train))[:max_sig]
            signal_train = signal_train[subsample]
            signal_weight_train = signal_weight_train[subsample]

        if max_bkg is not None and max_bkg < len(background_train):
            subsample = np.random.permutation(len(background_train))[:max_bkg]
            background_train = background_train[subsample]
            background_weight_train = background_weight_train[subsample]

        if same_size_sig_bkg:
            if len(background_train) > len(signal_train):
                # random subsample of background so it's the same size as signal
                subsample = np.random.permutation(
                    len(background_train))[:len(signal_train)]
                background_train = background_train[subsample]
                background_weight_train = background_weight_train[subsample]
            elif len(background_train) < len(signal_train):
                # random subsample of signal so it's the same size as background
                subsample = np.random.permutation(
                    len(signal_train))[:len(background_train)]
                signal_train = signal_train[subsample]
                signal_weight_train = signal_weight_train[subsample]

        if norm_sig_to_bkg:
            # normalize signal to background
            signal_weight_train *= (
                background_weight_train.sum() / signal_weight_train.sum())

        log.info("Training Samples:")
        log.info("Signal: %d events, %s features" % signal_train.shape)
        log.info("Sum(signal weights): %f" % signal_weight_train.sum())
        log.info("Background: %d events, %s features" % background_train.shape)
        log.info("Sum(background weight): %f" % background_weight_train.sum())
        log.info("Total: %d events" % (
            signal_train.shape[0] + background_train.shape[0]))

        sample_train = np.concatenate((background_train, signal_train))
        sample_weight_train = np.concatenate(
            (background_weight_train, signal_weight_train))
        labels_train = np.concatenate(
            (np.zeros(len(background_train)), np.ones(len(signal_train))))

        if self.standardize:  # TODO use same std for classification
            sample_train = std(sample_train)

        # random permutation of training sample
        perm = np.random.permutation(len(labels_train))
        sample_train = sample_train[perm]
        sample_weight_train = sample_weight_train[perm]
        labels_train = labels_train[perm]

        log.info("training a new classifier...")
        # log.info("plotting input variables as they are given to the BDT")
        # for i, branch in enumerate(self.fields):
        #     ... histogram branch_data for signal and background, weighted
        #     by sample_weight_train, and save to PLOTS_DIR ...
        # log.info("plotting sample weights ...")
        #     ... histogram sample_weight_train for signal and background ...

        if partition_idx == 0:
            # grid search params
            min_leaf_high = int((sample_train.shape[0] / 8) *
                                (cv_nfold - 1.) / cv_nfold)
            min_leaf_low = max(10, int(min_leaf_high / 100.))
            min_leaf_step = max((min_leaf_high - min_leaf_low) / 50, 1)
            max_n_estimators = 200
            min_n_estimators = 1
            n_estimators_step = 50
            min_samples_leaf = range(min_leaf_low, min_leaf_high, min_leaf_step)
            # n_estimators = range(min_n_estimators, max_n_estimators, n_estimators_step)
            n_estimators = np.power(2, np.arange(0, 8))
            grid_params = {
                'base_estimator__min_samples_leaf': min_samples_leaf,
                #'n_estimators': n_estimators
            }
            clf = AdaBoostClassifier(
                DecisionTreeClassifier(),
                learning_rate=.1,
                algorithm='SAMME.R',
                random_state=0)
            grid_clf = BoostGridSearchCV(
                clf, grid_params,
                max_n_estimators=max_n_estimators,
                min_n_estimators=min_n_estimators,
                # can use default ClassifierMixin score
                # score_func=precision_score,
                cv=StratifiedKFold(labels_train, cv_nfold),
                n_jobs=20)
            log.info("")
            log.info("using a %d-fold cross validation" % cv_nfold)
            log.info("performing a grid search over these parameter values:")
            for param, values in grid_params.items():
                log.info('{0} {1}'.format(param.split('__')[-1], values))
            log.info("Minimum number of classifiers: %d" % min_n_estimators)
            log.info("Maximum number of classifiers: %d" % max_n_estimators)
            log.info("")
            log.info("training new classifiers ...")
            grid_clf.fit(sample_train, labels_train,
                         sample_weight=sample_weight_train)
            clf = grid_clf.best_estimator_
            grid_scores = grid_clf.grid_scores_
            log.info("Best score: %f" % grid_clf.best_score_)
            log.info("Best Parameters:")
            log.info(grid_clf.best_params_)
            # plot a grid of the scores
            plot_grid_scores(
                grid_scores,
                best_point={
                    'base_estimator__min_samples_leaf':
                        clf.base_estimator.min_samples_leaf,
                    'n_estimators': clf.n_estimators},
                params={
                    'base_estimator__min_samples_leaf': 'min leaf',
                    'n_estimators': 'trees'},
                name=self.category.name + self.output_suffix + "_%d" % partition_idx)
            # scale up the min-leaf and retrain on the whole set
            min_samples_leaf = clf.base_estimator.min_samples_leaf
            clf = sklearn.clone(clf)
            clf.base_estimator.min_samples_leaf = int(
                min_samples_leaf * cv_nfold / float(cv_nfold - 1))
            clf.fit(sample_train, labels_train,
                    sample_weight=sample_weight_train)
            log.info("After scaling up min_leaf")
            out = StringIO()
            print >> out
            print >> out
            print >> out, clf
            log.info(out.getvalue())
        else:
            # training on the other partition
            log.info("training a new classifier ...")
            # use same params as in first partition
            clf = sklearn.clone(clf)
            out = StringIO()
            print >> out
            print >> out
            print >> out, clf
            log.info(out.getvalue())
            clf.fit(sample_train, labels_train,
                    sample_weight=sample_weight_train)

        if isinstance(clf, AdaBoostClassifier):
            # export to graphviz dot format
            if os.path.isdir(clf_filename):
                shutil.rmtree(clf_filename)
            os.mkdir(clf_filename)
            for itree, tree in enumerate(clf):
                export_graphviz(
                    tree,
                    out_file=os.path.join(
                        clf_filename, 'tree_{0:d}.dot'.format(itree)),
                    feature_names=self.all_fields)

        with open('{0}.pickle'.format(clf_filename), 'w') as f:
            pickle.dump(clf, f)

        print_feature_ranking(clf, self.fields)

        # each classifier is used on the partition opposite to the one it
        # was trained on
        self.clfs[(partition_idx + 1) % 2] = clf
def Optimize(name, X, y, features_array, signal_selection, bkg_selection,
             DumpDiscriminators=False, DumpFile="", Optimization_fraction=0.1,
             train_test_splitting=0.2, verbosity=False):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=train_test_splitting)
    # hyperparameter optimization runs on a skimmed fraction (default 10%) of the training set
    X_train_skimmed = np.asarray([X_train[i] for i in range(len(X_train)) if i % int(1. / Optimization_fraction) == 0])
    y_train_skimmed = np.asarray([y_train[i] for i in range(len(y_train)) if i % int(1. / Optimization_fraction) == 0])

    Classifiers = {}
    grid_verbose = 3 if verbosity else 0

    #
    # GBC
    #
    log.info('%s %s %s: Starting to process %s Gradient Boosting Classifier %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    gbc_parameters = {'n_estimators': [50, 100, 200],
                      'max_depth': [5, 10, 15],
                      'min_samples_split': [int(0.005 * len(X_train_skimmed)), int(0.01 * len(X_train_skimmed))],
                      'learning_rate': [0.05, 0.1]}
    gbc_clf = GridSearchCV(GradientBoostingClassifier(), gbc_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    gbc_clf.fit(X_train_skimmed, y_train_skimmed)

    gbc_best_clf = gbc_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(gbc_best_clf.get_params()))
    gbc_best_clf.verbose = 2
    gbc_best_clf.fit(X_train, y_train)
    gbc_disc = gbc_best_clf.predict_proba(X_test)[:, 1]
    gbc_fpr, gbc_tpr, gbc_thresholds = roc_curve(y_test, gbc_disc)

    Classifiers["GBC"] = (gbc_best_clf, y_test, gbc_disc, gbc_fpr, gbc_tpr, gbc_thresholds)

    #
    # Random Forest
    #
    log.info('%s %s %s: Starting to process %s Random Forest Classifier %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    rf_parameters = {'n_estimators': [50, 100, 200],
                     'max_depth': [5, 10, 15],
                     'min_samples_split': [int(0.005 * len(X_train_skimmed)), int(0.01 * len(X_train_skimmed))],
                     'max_features': ["sqrt", "log2", 0.5]}
    rf_clf = GridSearchCV(RandomForestClassifier(n_jobs=5), rf_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    rf_clf.fit(X_train_skimmed, y_train_skimmed)

    rf_best_clf = rf_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(rf_best_clf.get_params()))
    rf_best_clf.verbose = 2
    rf_best_clf.fit(X_train, y_train)
    rf_disc = rf_best_clf.predict_proba(X_test)[:, 1]
    rf_fpr, rf_tpr, rf_thresholds = roc_curve(y_test, rf_disc)

    Classifiers["RF"] = (rf_best_clf, y_test, rf_disc, rf_fpr, rf_tpr, rf_thresholds)

    #
    # Stochastic Gradient Descent
    #
    log.info('%s %s %s: Starting to process %s Stochastic Gradient Descent %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    # only losses that provide predict_proba ('log', 'modified_huber') are scanned
    sgd_parameters = {'loss': ['log', 'modified_huber'],
                      'penalty': ['l2', 'l1', 'elasticnet'],
                      'alpha': [0.0001, 0.00005, 0.001],
                      'n_iter': [10, 50, 100]}
    sgd_clf = GridSearchCV(SGDClassifier(learning_rate='optimal'), sgd_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    sgd_clf.fit(X_train_skimmed, y_train_skimmed)

    sgd_best_clf = sgd_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(sgd_best_clf.get_params()))
    sgd_best_clf.verbose = 2
    sgd_best_clf.fit(X_train, y_train)
    sgd_disc = sgd_best_clf.predict_proba(X_test)[:, 1]
    sgd_fpr, sgd_tpr, sgd_thresholds = roc_curve(y_test, sgd_disc)

    Classifiers["SGD"] = (sgd_best_clf, y_test, sgd_disc, sgd_fpr, sgd_tpr, sgd_thresholds)

    #
    # Nearest Neighbors
    #
    log.info('%s %s %s: Starting to process %s Nearest Neighbors %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    knn_parameters = {'n_neighbors': [5, 10, 50, 100],
                      'algorithm': ['ball_tree', 'kd_tree', 'brute'],
                      'leaf_size': [20, 30, 40],
                      'metric': ['euclidean', 'minkowski', 'manhattan', 'chebyshev']}
    knn_clf = GridSearchCV(KNeighborsClassifier(), knn_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    knn_clf.fit(X_train_skimmed, y_train_skimmed)

    knn_best_clf = knn_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(knn_best_clf.get_params()))
    knn_best_clf.verbose = 2
    knn_best_clf.fit(X_train, y_train)
    knn_disc = knn_best_clf.predict_proba(X_test)[:, 1]
    knn_fpr, knn_tpr, knn_thresholds = roc_curve(y_test, knn_disc)

    Classifiers["kNN"] = (knn_best_clf, y_test, knn_disc, knn_fpr, knn_tpr, knn_thresholds)

    #
    # Naive Bayes (Likelihood Ratio)
    #
    log.info('%s %s %s: Starting to process %s Naive Bayes (Likelihood Ratio) %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    nb_best_clf = GaussianNB()  # there is nothing to tune for a simple likelihood ratio
    if verbosity:
        log.info('Parameters of the best classifier: A simple likelihood ratio has no parameters to be tuned!')
    nb_best_clf.verbose = 2
    nb_best_clf.fit(X_train, y_train)
    nb_disc = nb_best_clf.predict_proba(X_test)[:, 1]
    nb_fpr, nb_tpr, nb_thresholds = roc_curve(y_test, nb_disc)

    Classifiers["NB"] = (nb_best_clf, y_test, nb_disc, nb_fpr, nb_tpr, nb_thresholds)

    #
    # Multi-Layer Perceptron (Neural Network)
    #
    log.info('%s %s %s: Starting to process %s Multi-Layer Perceptron (Neural Network) %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    mlp_parameters = {'activation': ['tanh', 'relu'],
                      'hidden_layer_sizes': [10, (5, 10), (10, 15)],
                      'algorithm': ['adam'],
                      'alpha': [0.0001, 0.00005],
                      'tol': [0.00001, 0.00005, 0.0001],
                      'learning_rate_init': [0.001, 0.005, 0.0005]}
    mlp_clf = GridSearchCV(MLPClassifier(max_iter=500), mlp_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)  # learning_rate='adaptive'
    mlp_clf.fit(X_train_skimmed, y_train_skimmed)

    mlp_best_clf = mlp_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(mlp_best_clf.get_params()))
    mlp_best_clf.verbose = 2
    mlp_best_clf.fit(X_train, y_train)
    mlp_disc = mlp_best_clf.predict_proba(X_test)[:, 1]
    mlp_fpr, mlp_tpr, mlp_thresholds = roc_curve(y_test, mlp_disc)

    Classifiers["MLP"] = (mlp_best_clf, y_test, mlp_disc, mlp_fpr, mlp_tpr, mlp_thresholds)

    #
    # Support Vector Machine
    #
    log.info('%s %s %s: Starting to process %s Support Vector Machine %s' % (Fore.GREEN, name, Fore.WHITE, Fore.BLUE, Fore.WHITE))

    svm_parameters = {'kernel': ['rbf'], 'gamma': ['auto', 0.05], 'C': [0.9, 1.0]}
    svm_clf = GridSearchCV(SVC(probability=True), svm_parameters, n_jobs=-1, verbose=grid_verbose, cv=2)
    svm_clf.fit(X_train_skimmed, y_train_skimmed)

    svm_best_clf = svm_clf.best_estimator_
    if verbosity:
        log.info('Parameters of the best classifier: %s' % str(svm_best_clf.get_params()))
    svm_best_clf.verbose = 2
    # the refit on the full training set was left commented out in the original,
    # presumably because SVC training scales poorly; the grid-search estimator
    # (trained on the skimmed set) is evaluated directly
    #svm_best_clf.fit(X_train, y_train)
    svm_disc = svm_best_clf.predict_proba(X_test)[:, 1]
    svm_fpr, svm_tpr, svm_thresholds = roc_curve(y_test, svm_disc)

    Classifiers["SVM"] = (svm_best_clf, y_test, svm_disc, svm_fpr, svm_tpr, svm_thresholds)

    if DumpDiscriminators:
        XX = rootnp.root2array(DumpFile, 'tree', features_array, None, 0, None, None, False, 'weight')
        XX = rootnp.rec2array(XX)

        ordered_MVAs = ['GBC', 'RF', 'SVM', 'SGD', 'kNN', 'NB', 'MLP']
        dict_Discriminators = {}
        for c in ordered_MVAs:
            classifier = Classifiers[c][0]
            dict_Discriminators[name + '_' + c] = classifier.predict_proba(XX)[:, 1]

        inputfile = ROOT.TFile(DumpFile)
        inputtree = inputfile.Get('tree')
        inputtree.SetBranchStatus("*", 1)
        branch_list = inputtree.GetListOfBranches()
        branch_name_list = [d.GetName() for d in branch_list]
        # deactivate discriminator branches that already exist so they are rewritten
        for mva in ordered_MVAs:
            branch_name = name + "_" + mva
            if branch_name in branch_name_list:
                inputtree.SetBranchStatus(branch_name, 0)

        newfile = ROOT.TFile(DumpFile.split('.root')[0] + '_tmp.root', 'RECREATE')
        newtree = inputtree.CloneTree(0)
        dict_Leaves = {}
        for mva in ordered_MVAs:
            branch_name = name + "_" + mva
            dict_Leaves[branch_name] = array('d', [0])
            newtree.Branch(branch_name, dict_Leaves[branch_name], branch_name + "/D")

        log.info('%s: Starting to process the output tree' % name)
        nEntries = inputtree.GetEntries()
        for i in range(nEntries):
            if i % 1000 == 0:
                log.info('Processing event %s/%s (%s%.2f%s%%)' % (i, nEntries, Fore.GREEN, 100 * float(i) / float(nEntries), Fore.WHITE))
            inputtree.GetEntry(i)
            for key, value in dict_Discriminators.iteritems():
                dict_Leaves[key][0] = value[i]
            newtree.Fill()

        newtree.Write()
        newfile.Close()
        inputfile.Close()
        os.system('cp %s %s' % (DumpFile.split('.root')[0] + '_tmp.root', DumpFile))
        os.system('rm %s' % (DumpFile.split('.root')[0] + '_tmp.root'))
        log.info('Done: output file dumped in %s' % DumpFile)

    return Classifiers
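A minimal usage sketch for Optimize(), assuming the module-level imports it relies on (sklearn, colorama's Fore, the log object) are in place; X_toy, y_toy and the feature names are hypothetical stand-ins, not taken from the original analysis, and no ROOT dumping is done.

# hypothetical invocation of Optimize() on a small synthetic dataset
import numpy as np
from sklearn.metrics import roc_auc_score

X_toy = np.random.normal(size=(2000, 4))
y_toy = (X_toy[:, 0] + 0.5 * X_toy[:, 1] > 0).astype(int)

classifiers = Optimize("toy", X_toy, y_toy,
                       features_array=["f0", "f1", "f2", "f3"],
                       signal_selection="", bkg_selection="",
                       DumpDiscriminators=False, verbosity=False)
# each entry holds (estimator, y_test, discriminator, fpr, tpr, thresholds)
for mva, (clf, y_test, disc, fpr, tpr, thresholds) in classifiers.items():
    print mva, "AUC =", roc_auc_score(y_test, disc)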
def run(source, quick=False):
    print time.asctime(time.localtime()), "Copying datasets"

    branch_names = []
    with open(source, 'rb') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in reader:
            branch_names.append(row[0])

    # in quick mode only read every 100th entry
    step_size = 100 if quick else 1

    # only the Bs2phiphi selection is configured; other sources would need their own inputs here
    if source == "BDTvarBs2phiphi.csv":
        myselection = "B_s0_MM > 5486.77"
        backgr = root2array("/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA.root",
                            "DecayTree", branch_names, myselection, step=step_size)
        backgr = rec2array(backgr)
        signal = root2array("/net/storage03/data/users/dlafferty/NTuples/SignalMC/2012/combined/Bs2phiphi_MC_2012_combined_corrected_TupleA.root",
                            "DecayTree", branch_names, step=step_size)
        signal = rec2array(signal)

    # data contains every data point (later split into development and evaluation)
    data = np.concatenate((signal, backgr))
    # output contains the binary class of each data point
    output = np.concatenate((np.ones(signal.shape[0]), np.zeros(backgr.shape[0])))

    frac = 0.5
    data_dev, data_eval, output_dev, output_eval = train_test_split(data, output, test_size=0.33, random_state=492)
    data_train, data_test, output_train, output_test = train_test_split(data_dev, output_dev, test_size=frac, random_state=42)

    joblib.dump(branch_names, 'pickle/variables.pkl')

    print time.asctime(time.localtime()), "Data + MC contains", len(data), "entries. Training on", len(data_train), "entries"
    print time.asctime(time.localtime()), "Monte Carlo (signal) contains", len(signal), "entries"

    # in quick mode the pickles get a 'q' suffix so they do not overwrite the full datasets
    suffix = 'q' if quick else ''
    joblib.dump(signal, 'pickle/all_signal%s.pkl' % suffix)
    joblib.dump(data, 'pickle/all_data%s.pkl' % suffix)
    joblib.dump(data_dev, 'pickle/datadev%s.pkl' % suffix)
    joblib.dump(data_eval, 'pickle/dataev%s.pkl' % suffix)
    joblib.dump(output_dev, 'pickle/outputdev%s.pkl' % suffix)
    joblib.dump(output_eval, 'pickle/outputev%s.pkl' % suffix)
    joblib.dump(data_train, 'pickle/data%s.pkl' % suffix)
    joblib.dump(data_test, 'pickle/datatest%s.pkl' % suffix)
    joblib.dump(output_train, 'pickle/output%s.pkl' % suffix)
    joblib.dump(output_test, 'pickle/outputtest%s.pkl' % suffix)

    print time.asctime(time.localtime()), "Datasets produced!"
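A hedged sketch of how a downstream script might reload the splits written by run(); the file names mirror the non-quick dumps above, and the joblib import path is an assumption that depends on the local setup.

# hypothetical loader for the pickles produced by run() (non-quick mode)
import joblib  # or sklearn.externals.joblib, depending on which one run() used

branch_names = joblib.load('pickle/variables.pkl')
data_train = joblib.load('pickle/data.pkl')
output_train = joblib.load('pickle/output.pkl')
print "loaded", len(data_train), "training entries,", len(branch_names), "branches"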
parser.add_argument('--pickEvery', type=int, default=10, help='pick one element every ...')
args = parser.parse_args()

if args.batch:
    ROOT.gROOT.SetBatch(True)

features = general + vertex + leptons

filename = "./TTjets.root"
treename = "tree"
File = TFile(filename)
tree = File.Get(treename)

# read the feature branches into a numpy record array and flatten it to a 2D array
treeArray = rootnp.root2array(filename, treename, features, None, 0, args.element_per_sample, args.pickEvery, False, 'weight')
X = rootnp.rec2array(treeArray)

flavours = rootnp.root2array(filename, treename, "flavour", None, 0, args.element_per_sample, args.pickEvery, False, 'weight')
y = np.ones(len(flavours))

assert args.signal in ("C", "B", "DUSG"), "Invalid signal flavour: " + args.signal + ", must be C, B or DUSG"
signalselection = ""
bckgrselection = ""
if args.signal == "C":
    for i, fl in enumerate(flavours):
        y[i] = 1 if abs(fl) == 4 else 0
    signalselection = "flavour == 4"
    assert args.bkg in ("DUSG", "B"), "Invalid background flavour: " + args.bkg + ", must be either DUSG or B for signal flavour: " + args.signal
    if args.bkg == "DUSG":
        bckgrselection = "flavour != 4 && flavour != 5"
    elif args.bkg == "B":
        bckgrselection = "flavour == 5"
elif args.signal == "B":
    for i, fl in enumerate(flavours):
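The per-event label loop above can be collapsed into one vectorized numpy expression; a sketch for the signal == "C" case, assuming flavours is a 1D array of jet-flavour codes where charm is 4.

# vectorized equivalent of the label loop for signal == "C":
# charm jets (|flavour| == 4) are signal, everything else is background
y = (np.abs(flavours) == 4).astype(float)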
def BestClassifier(Classifiers, FoM, typ_name='', features_array=[], signal_selection='', bkg_selection='', DumpDiscriminators=False, DumpFile=""):
    """
    Goal: select from a set of classifier dictionaries (containing the name, object, discriminators, tpr, ...) the best one based on the Figure of Merit FoM
    returns: name_of_best_clf, best_clf_object
    """
    assert FoM in ('AUC', 'OOP', 'ACC', 'PUR'), "Invalid Figure of Merit: " + FoM

    FoM_tmp = {}
    for name, clf in Classifiers.items():
        y_true = clf[1]
        disc = clf[2]
        fpr = clf[3]
        tpr = clf[4]
        thres = clf[5]
        disc_s = disc[y_true == 1]
        disc_b = disc[y_true == 0]
        tp = [len(disc_s[disc_s >= t]) for t in thres]
        fp = [len(disc_b[disc_b >= t]) for t in thres]
        tn = [len(disc_b[disc_b < t]) for t in thres]
        fn = [len(disc_s[disc_s < t]) for t in thres]

        if FoM == 'AUC':
            # Area under the ROC curve
            FoM_tmp[name] = roc_auc_score(y_true, disc)
        elif FoM == 'OOP':
            # Optimal Operating Point: closest approach to (fpr, tpr) = (0, 1)
            dist = [math.sqrt((i - 1) ** 2 + (j - 0) ** 2) for i, j in zip(tpr, fpr)]
            FoM_tmp[name] = 1 - min(dist)
        elif FoM == 'PUR':
            # Purity at the working point with signal efficiency closest to atEff
            atEff = 0.5
            pur = [float(i) / float(i + j) if (i + j != 0) else 0 for i, j in zip(tp, fp)]
            val, dx = min((val, dx) for (dx, val) in enumerate([abs(atEff - i) for i in tpr]))
            FoM_tmp[name] = pur[dx]
        elif FoM == 'ACC':
            # Accuracy at the working point with signal efficiency closest to atEff
            # (the original looked up dx here without defining it in this branch)
            atEff = 0.5
            val, dx = min((val, dx) for (dx, val) in enumerate([abs(atEff - i) for i in tpr]))
            Acc = [float(i + j) / float(i + j + k + l) if (i + j + k + l != 0) else 0 for i, j, k, l in zip(tp, tn, fp, fn)]
            FoM_tmp[name] = Acc[dx]

    best_mva_name = max(FoM_tmp.iteritems(), key=itemgetter(1))[0]

    if DumpDiscriminators:
        # the original always ranked by the AUC dictionary here, which crashes for
        # any other FoM because only the chosen FoM dictionary is filled; rank by
        # FoM_tmp instead
        XX = rootnp.root2array(DumpFile, 'tree', features_array, None, 0, None, None, False, 'weight')
        XX = rootnp.rec2array(XX)

        dict_Discriminators = {}
        classifier = Classifiers[best_mva_name][0]
        dict_Discriminators[typ_name + '_BEST_' + best_mva_name] = classifier.predict_proba(XX)[:, 1]

        inputfile = ROOT.TFile(DumpFile)
        inputtree = inputfile.Get('tree')
        inputtree.SetBranchStatus("*", 1)
        branch_list = inputtree.GetListOfBranches()
        branch_name_list = [d.GetName() for d in branch_list]
        branch_name = typ_name + '_BEST_'
        if any([branch_name in s for s in branch_name_list]):
            inputtree.SetBranchStatus(branch_name + "*", 0)

        newfile = ROOT.TFile(DumpFile.split('.root')[0] + '_tmp.root', 'RECREATE')
        newtree = inputtree.CloneTree(0)

        dict_Leaves = {}
        branch_name = typ_name + '_BEST_' + best_mva_name
        dict_Leaves[branch_name] = array('d', [0])
        newtree.Branch(branch_name, dict_Leaves[branch_name], branch_name + "/D")

        log.info('%s: Starting to process the output tree' % typ_name)
        nEntries = inputtree.GetEntries()
        for i in range(nEntries):
            if i % 1000 == 0:
                log.info('Processing event %s/%s (%s%.2f%s%%)' % (i, nEntries, Fore.GREEN, 100 * float(i) / float(nEntries), Fore.WHITE))
            inputtree.GetEntry(i)
            for key, value in dict_Discriminators.iteritems():
                dict_Leaves[key][0] = value[i]
            newtree.Fill()

        newtree.Write()
        newfile.Close()
        inputfile.Close()
        os.system('cp %s %s' % (DumpFile.split('.root')[0] + '_tmp.root', DumpFile))
        os.system('rm %s' % (DumpFile.split('.root')[0] + '_tmp.root'))
        log.info('Done: output file dumped in %s' % DumpFile)

    return best_mva_name, Classifiers[best_mva_name][0]
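A hedged sketch of how Optimize() and BestClassifier() chain together, reusing the hypothetical X_toy and y_toy arrays from the Optimize sketch above; no tree dumping is performed.

# hypothetical: train all MVAs, then pick the one with the largest ROC AUC
classifiers = Optimize("toy", X_toy, y_toy, features_array=[],
                       signal_selection="", bkg_selection="")
best_name, best_clf = BestClassifier(classifiers, 'AUC')
print "best MVA by AUC:", best_name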
log.info("processing file %s for training" % fname) with io.root_open(fname) as tfile: match = fname_regex.match(fname) if not match: raise ValueError("Could not match the regex to the file %s" % fname) flavor = match.group("flavor") full_category = match.group("category") category = [i for i in sv_categories if i in full_category][0] # if flavor == 'C': # log.info('Jet_flavour %s is not considered signal or background in this training and is omitted' % flavor) # continue nfiles_per_sample = None skip_n_events = 2 # put this to 1 to include all the events tree = rootnp.root2array(fname, "ttree", variables, None, 0, nfiles_per_sample, skip_n_events, False, "weight") tree = rootnp.rec2array(tree) X = np.concatenate((X, tree), 0) if flavor == "B": y = np.concatenate((y, np.ones(tree.shape[0]))) weight_B = np.empty(tree.shape[0]) weight_B.fill(2) weights_flavour = np.concatenate((weights_flavour, weight_B)) elif flavor == "C": y = np.concatenate((y, np.zeros(tree.shape[0]))) weight_C = np.empty(tree.shape[0]) weight_C.fill(1) weights_flavour = np.concatenate((weights_flavour, weight_C)) else: y = np.concatenate((y, np.zeros(tree.shape[0]))) weight_DUSG = np.empty(tree.shape[0]) weight_DUSG.fill(3)
def draw_array_helper(self, field_hist, category, region, cuts=None,
                      weighted=True, field_scale=None, weight_hist=None,
                      scores=None, clf=None, min_score=None, max_score=None,
                      systematic='NOMINAL', bootstrap_data=False):
    from .data import Data, DataInfo

    all_fields = []
    classifiers = []
    for f in field_hist.iterkeys():
        if isinstance(f, basestring):
            all_fields.append(f)
        elif isinstance(f, Classifier):
            classifiers.append(f)
        else:
            all_fields.extend(list(f))
    if len(classifiers) > 1:
        raise RuntimeError("more than one classifier in fields is not supported")
    elif len(classifiers) == 1:
        classifier = classifiers[0]
    else:
        classifier = None

    if isinstance(self, Data) and bootstrap_data:
        log.info("using bootstrapped data")
        analysis = bootstrap_data
        recs = []
        scores = []
        for s in analysis.backgrounds:
            rec = s.merged_records(category, region, fields=all_fields,
                                   cuts=cuts, include_weight=True,
                                   clf=clf, systematic=systematic)
            recs.append(rec)
        b_rec = stack(recs, fields=all_fields + ['classifier', 'weight'])
        s_rec = analysis.higgs_125.merged_records(category, region,
                                                  fields=all_fields, cuts=cuts,
                                                  include_weight=True, clf=clf,
                                                  systematic=systematic)

        # handle negative weights separately
        b_neg = b_rec[b_rec['weight'] < 0]
        b_pos = b_rec[b_rec['weight'] >= 0]

        def bootstrap(rec):
            prob = np.abs(rec['weight'])
            prob = prob / prob.sum()
            # random sample without replacement
            log.warning(str(int(round(abs(rec['weight'].sum())))))
            sample_idx = np.random.choice(
                rec.shape[0],
                size=int(round(abs(rec['weight'].sum()))),
                replace=False, p=prob)
            return rec[sample_idx]

        rec = stack([bootstrap(b_neg), bootstrap(b_pos), bootstrap(s_rec)],
                    fields=all_fields + ['classifier', 'weight'])
        rec['weight'][:] = 1.
        scores = rec['classifier']

    else:
        # TODO: only get unblinded vars
        rec = self.merged_records(category, region, fields=all_fields,
                                  cuts=cuts, include_weight=True,
                                  clf=classifier, systematic=systematic)

    if isinstance(scores, tuple):
        # sanity
        #assert (scores[1] == rec['weight']).all()
        # ignore the score weights since they should be the same as the rec weights
        scores = scores[0]

    if weight_hist is not None and scores is not None:
        log.warning("applying a weight histogram")
        edges = np.array(list(weight_hist.xedges()))
        # handle strange cases
        edges[0] -= 1E10
        edges[-1] += 1E10
        weights = np.array(list(weight_hist.y())).take(edges.searchsorted(scores) - 1)
        weights = rec['weight'] * weights
    else:
        weights = rec['weight']

    if scores is not None:
        if min_score is not None:
            idx = scores > min_score
            rec = rec[idx]
            weights = weights[idx]
            scores = scores[idx]
        if max_score is not None:
            idx = scores < max_score
            rec = rec[idx]
            weights = weights[idx]
            scores = scores[idx]

    for fields, hist in field_hist.items():
        if isinstance(fields, Classifier):
            fields = ['classifier']
        # fields can be a single field or list of fields
        elif not isinstance(fields, (list, tuple)):
            fields = [fields]
        if hist is None:
            # this var might be blinded
            continue
        # defensive copy
        if isinstance(fields, tuple):
            # select columns in numpy recarray with a list
            fields = list(fields)
        arr = np.copy(rec[fields])
        if field_scale is not None:
            for field in fields:
                if field in field_scale:
                    arr[field] *= field_scale[field]
        # convert to array
        arr = rec2array(arr, fields=fields)
        # HACK HACK HACK
        _weights = weights
        if fields == ['dEta_jets']:
            log.warning("HACK HACK")
            nonzero = arr > 0
            arr = arr[nonzero]
            _weights = weights[nonzero]
        # include the scores if the histogram dimensionality allows
        if scores is not None and hist.GetDimension() == len(fields) + 1:
            arr = np.c_[arr, scores]
        elif hist.GetDimension() != len(fields):
            raise TypeError(
                'histogram dimensionality does not match '
                'number of fields: %s' % (', '.join(fields)))
        hist.fill_array(arr, weights=_weights)
        if isinstance(self, Data):
            if hasattr(hist, 'datainfo'):
                hist.datainfo += self.info
            else:
                hist.datainfo = DataInfo(self.info.lumi, self.info.energies)
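A standalone sketch of the weight-histogram lookup used above: each score is mapped to its bin in the edge array with searchsorted, and the bin content is picked up as a per-event weight. The numbers are toy values, not from the analysis.

# toy demonstration of the searchsorted bin-weight lookup
import numpy as np

edges = np.array([0.0, 0.25, 0.5, 0.75, 1.0])
bin_weights = np.array([1.1, 0.9, 1.0, 1.3])  # one weight per bin
scores = np.array([0.1, 0.6, 0.8])
per_event = bin_weights.take(edges.searchsorted(scores) - 1)
print per_event  # [ 1.1  1.   1.3]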
for bootstrap_idx in range(100):
    sys.stdout.write("bootstrap {0} ...\r".format(bootstrap_idx))
    sys.stdout.flush()

    # resample with replacement
    # http://docs.scipy.org/doc/numpy-dev/reference/generated/numpy.random.choice.html
    sample_idx = np.random.choice(len(array), size=len(array), replace=True)
    array_bootstrapped = array[sample_idx]

    # convert back to a TTree and write it out
    tree_bootstrapped = array2tree(
        array_bootstrapped,
        name='bootstrap_{0}'.format(bootstrap_idx))
    tree_bootstrapped.Write()
    tree_bootstrapped.Delete()

    # fill the ROOT histogram with the numpy array
    hist.Reset()
    fill_hist(hist, rec2array(array_bootstrapped))
    hist.Draw()
    hist.xaxis.title = 'x'
    hist.yaxis.title = 'y'
    hist.zaxis.title = 'Events'
    hist.xaxis.limits = (-2.5, 2.5)
    hist.yaxis.limits = (-2.5, 2.5)
    hist.zaxis.range_user = (0, 60)
    hist.xaxis.divisions = 5
    hist.yaxis.divisions = 5
    hist.zaxis.divisions = 5
    canvas.Print('bootstrap.gif+50')

# loop the gif
canvas.Print('bootstrap.gif++')
output.Close()
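A hedged, pure-numpy sketch of what the resampling loop above achieves: the spread of a statistic across bootstrap replicas estimates its statistical uncertainty. The dataset and replica count are toy choices.

# toy bootstrap: the std of the resampled means estimates the uncertainty on the mean
import numpy as np

rng = np.random.RandomState(0)
data = rng.normal(size=1000)
means = [data[rng.choice(len(data), size=len(data), replace=True)].mean()
         for _ in range(100)]
print "bootstrap mean = %.3f +/- %.3f" % (np.mean(means), np.std(means))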