def saveTree(processPath, dictVar, vector, MVAVector=None, SF=1, nameTree="reducedTree"):
    from root_numpy import array2root
    i = 0
    for key in dictVar.keys():
        if i == 0:
            writeMode = 'recreate'
            i = 1
        else:
            writeMode = 'update'
        v = np.asarray(vector[:, dictVar[key]])
        name = key
        if key == 'diphotonCandidate.M()':
            name = 'Mgg'
        elif key == 'dijetCandidate.M()':
            name = 'Mjj'
        #elif key == 'HHTagger2017':
        #    name = 'MVAOutput'
        if SF != 1:
            if key == 'weight':
                v = np.multiply(np.asarray(v), SF)
        v.dtype = [(name.replace(".", "").replace("(", "").replace(")", "")
                        .replace("/", "_Over_").replace("_", "").replace("Candidate", ""),
                    np.float64)]
        array2root(v, processPath, nameTree, mode=writeMode)
    if MVAVector is not None:
        v = np.asarray(MVAVector.ravel())
        v.dtype = [('MVAOutput', np.float64)]
        array2root(v, processPath, nameTree, mode='update')
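# A minimal, self-contained sketch (not from the original) of the
# recreate/update pattern that saveTree relies on: the first array2root call
# with mode='recreate' creates the file and tree, and later calls with
# mode='update' extend the same tree. Branch and file names are illustrative.
import numpy as np
from root_numpy import array2root

mgg = np.zeros(10, dtype=[('Mgg', np.float64)])
array2root(mgg, 'sketch.root', 'reducedTree', mode='recreate')  # new file, new tree
array2root(mgg, 'sketch.root', 'reducedTree', mode='update')    # appends entries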
def test_array2tree_fixed_length_arrays():
    f = load(['fixed1.root', 'fixed2.root'])
    a = rnp.root2array(f)
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
def write_prediction_to_file(features, scaler, model, filename='', treename='', branch=''):
    data_out = features
    features = features[:, 0:NDIM]
    features = scaler.transform(features)
    y_predict_all = model.predict(features)  # normal numpy array
    data_out = numpy.concatenate((data_out, y_predict_all), axis=1)
    dtype = numpy.dtype([('layer', numpy.float32), ('n1', numpy.float32), ('n2', numpy.float32),
                         ('n3', numpy.float32), ('n4', numpy.float32), ('n5', numpy.float32),
                         ('n6', numpy.float32), ('un1', numpy.float32), ('un2', numpy.float32),
                         ('un3', numpy.float32), ('un4', numpy.float32), ('un5', numpy.float32),
                         ('un6', numpy.float32), ('dn1', numpy.float32), ('dn2', numpy.float32),
                         ('dn3', numpy.float32), ('dn4', numpy.float32), ('dn5', numpy.float32),
                         ('dn6', numpy.float32), ('nup', numpy.float32), ('ndown', numpy.float32),
                         ('event', numpy.float32), ('rechitsum', numpy.float32),
                         ('rechit', numpy.float32), ('rechit_dnn', numpy.float32)])
    data_out = numpy.core.records.fromarrays(data_out.transpose(), dtype=dtype)  # structured numpy array
    root_numpy.array2root(data_out, filename, treename=treename, mode='recreate')
def writeOutPrediction(self, predicted, features, truth, weights, outfilename, inputfile):
    # predicted will be a list
    spectator_branches = ['jet_pt', 'jet_eta']
    from root_numpy import array2root
    if inputfile[-5:] == 'djctd':
        print("storing normed pt and eta... run on root files if you want something else for now")
        spectators = features[0][:, 0:2].transpose()
    else:
        import uproot3 as uproot
        print(inputfile)
        urfile = uproot.open(inputfile)["deepntuplizer/tree"]
        spectator_arrays = urfile.arrays(spectator_branches)
        print(spectator_arrays)
        spectators = [spectator_arrays[a.encode()] for a in spectator_branches]
    out = np.core.records.fromarrays(
        np.vstack((predicted[0].transpose(), truth[0].transpose(), spectators)),
        names='prob_isB, prob_isBB, prob_isC, prob_isUDSG, isB, isBB, isC, isUDSG, jet_pt, jet_eta')
    array2root(out, outfilename, 'tree')
def _predict(args):
    data_iter = data_loader(args)
    preds = model.predict(data_iter).asnumpy()
    truths = data_iter.get_truths()
    observers = data_iter.get_observers()
    print(preds.shape, truths.shape, observers.shape)
    pred_output = {}
    for i, label in enumerate(data_iter._data_format.class_labels):
        pred_output['class_%s' % label] = truths[:, i]
        pred_output['score_%s' % label] = preds[:, i]
    for i, obs in enumerate(data_iter._data_format.obs_vars):
        pred_output[obs] = observers[:, i]
    import pandas as pd
    df = pd.DataFrame(pred_output)
    if args.predict_output:
        logging.info('Write prediction file to %s' % args.predict_output)
        outdir = os.path.dirname(args.predict_output)
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        df.to_hdf(args.predict_output, 'Events', format='table')
        from common.util import plotROC
        plotROC(preds, truths, output=os.path.join(outdir, 'roc.pdf'))
        from root_numpy import array2root
        array2root(df.to_records(index=False),
                   filename=args.predict_output.rsplit('.', 1)[0] + '.root',
                   treename='Events', mode='RECREATE')
def writeOutPrediction(self, predicted, features, truth, weights, outfilename, inputfile):
    # predicted is a list
    print("Prediction started")
    from root_numpy import array2root
    namesstring = 'prob_isPrompt, prob_isNonPrompt, prob_isFake, prob_lep_isFromSUSYandHF,'  # prob_isFromSUSY, prob_isFromSUSYHF,
    for label in self.truth_branches:
        namesstring += label + ', '
    features_string = ', '.join(self.global_branches)
    namesstring += features_string
    out = np.core.records.fromarrays(
        np.vstack((predicted[0].transpose(), truth[0].transpose(), features[0][:, :].transpose())),
        names=namesstring)
    # if one predicts on a DataCollection one has to change
    # the file extension to .root
    if not outfilename.endswith(".root"):
        print("Predicted from a DataCollection make root files")
        print("Note that outfiles files extensions must be adapted...")
        filename, _ = os.path.splitext(outfilename)
        outfilename = filename + ".root"
        print("making {}".format(outfilename.split('/')[-1]))
    array2root(out, outfilename, 'tree')
def save_data(self, metadata, data):
    if self.treemaker.uses_arrays:
        # Activate Joey's array saving code
        dataframe_to_root(data, self.path, treename=self.treemaker.__name__, mode='recreate')
    else:
        # Check we really aren't using arrays, otherwise we'll crash with a very uninformative message
        for branch_name in data.columns:
            if is_array_field(data, branch_name):
                raise TypeError(
                    "Column %s is an array field, and you want to save to root. Either "
                    "(1) use MultipleRowExtractor-based minitrees; or "
                    "(2) add a uses_arrays=True attribute to the %s class; or "
                    "(3) use pickle as your minitree format." % (branch_name,
                                                                 self.treemaker.__class__.__name__))
        root_numpy.array2root(data.to_records(), self.path,
                              treename=self.treemaker.__name__, mode='recreate')
    # Add metadata as JSON in a TNamed in the same ROOT file
    bla = ROOT.TNamed('metadata', json.dumps(metadata))
    minitree_f = ROOT.TFile(self.path, 'UPDATE')
    bla.Write()
    minitree_f.Close()
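# Hedged companion sketch (not part of the original): reading back the JSON
# metadata that save_data stores as a TNamed; the TNamed's title holds the
# JSON string. 'minitree.root' is a placeholder path.
import json
import ROOT

f = ROOT.TFile('minitree.root')
metadata = json.loads(f.Get('metadata').GetTitle())
f.Close()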
def run(name, source, quick=False):
    print time.asctime(time.localtime()), "Filling BDT Branches"
    branch_names = joblib.load("pickle/variables.pkl")
    if quick:
        signal = joblib.load('pickle/all_signalq.pkl')
        clf = joblib.load("pickle/" + name + "quick.pkl")
    else:
        signal = joblib.load('pickle/all_signal.pkl')
        clf = joblib.load("pickle/" + name + ".pkl")
    # predict and write probability of each MC event being signal
    bdt_MC_predicted = clf.predict_proba(signal)
    bdt_MC_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root(np.hsplit(bdt_MC_predicted, 2)[1],
               "/net/storage03/data/users/dlafferty/NTuples/SignalMC/2012/combined/Bs2phiphi_MC_2012_combined_corrected_TupleA_BDT.root",
               "DecayTree")
    # predict and write probability of every data event being signal
    all_data = root2array("/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
                          "DecayTree", branch_names)
    all_data = rec2array(all_data)
    bdt_data_predicted = clf.predict_proba(all_data)
    bdt_data_predicted.dtype = [('GradBoost_prob', np.float64)]
    array2root(np.hsplit(bdt_data_predicted, 2)[1],
               "/net/storage03/data/users/dlafferty/NTuples/data/2012/combined/Bs2phiphi_data_2012_corrected_TupleA_BDT.root",
               "DecayTree")
    print time.asctime(time.localtime()), "Branches Filled!"
def _predict(self, X, features_names=None, model_type=('classification', None)):
    """
    Predict data

    :param pandas.DataFrame X: data shape [n_samples, n_features]
    :return: predicted values of shape n_samples
    """
    self._check_fitted()
    directory = self._create_tmp_directory()
    try:
        with tempfile.NamedTemporaryFile(mode="w", suffix='.xml', dir=directory, delete=True) as file_xml:
            file_xml.write(self.formula_xml)
            file_xml.flush()
            add_info = _AdditionalInformationPredict(directory, file_xml.name, features_names,
                                                     self._method_name, model_type=model_type)
            root_numpy.array2root(X.astype(numpy.float32).to_records(),
                                  filename=add_info.filename, treename=add_info.treename)
            prediction = self._run_tmva_predict(add_info)
    finally:
        self._remove_tmp_directory(directory)
    return prediction
def _fit(self, X, y, sample_weight=None, features_names=None, model_type='classification'):
    """
    Train the classifier

    :param pandas.DataFrame X: data shape [n_samples, n_features]
    :param list | numpy.array y: values - array-like of shape [n_samples]
    :param list | numpy.array sample_weight: weight of events, array-like of shape
        [n_samples] or None if all weights are equal
    :return: self
    """
    # saving data to 2 different root files.
    directory = self._create_tmp_directory()
    add_info = _AdditionalInformation(directory, features_names, model_type=model_type)
    try:
        X[add_info.weight_column] = sample_weight
        X[add_info.target_column] = y
        root_numpy.array2root(X.to_records(), filename=add_info.filename, treename=add_info.treename)
        self._run_tmva_training(add_info)
    finally:
        self._remove_tmp_directory(directory)
    return self
def writeOutPrediction(self, predicted, features, truth, weights, outfilename, inputfile):
    # predicted will be a list
    from root_numpy import array2root
    out = np.core.records.fromarrays(
        np.vstack((predicted[0].transpose(), truth[0].transpose(), features[0][:, 0:2].transpose())),
        names='prob_isPrompt, prob_isNonPrompt, prob_isFake, lep_pt, lep_eta')
    array2root(out, outfilename, 'tree')
def SaveToRoot(self, df, path_output, output_name=None, out_idx=''):
    # Get the unique samples as a list #
    if output_name is None:
        sample_list = list(df[parameters.split_name].unique())
        # Loop over samples #
        for sample in sample_list:
            # We select the rows corresponding to this sample #
            sample_df = df.loc[df[parameters.split_name] == sample]
            # Remove tag and sample name (info in target as bool) #
            sample_df = sample_df.drop('tag', axis=1)
            sample_df = sample_df.drop('sample', axis=1)
            # From df to numpy array with dtype #
            sample_output = sample_df.to_records(index=False, column_dtypes='float64')
            sample_output.dtype.names = parameters.make_dtype(sample_output.dtype.names)  # because ( ) and . are an issue for root_numpy
            sample_output_name = os.path.join(path_output, sample + out_idx + '.root')
            # Save as root file #
            array2root(sample_output, sample_output_name, mode='recreate')
            logging.info('Output saved as : ' + sample_output_name)
    else:
        # From df to numpy array with dtype #
        full_output = df.to_records(index=False, column_dtypes='float64')
        full_output.dtype.names = parameters.make_dtype(full_output.dtype.names)  # because ( ) and . are an issue for root_numpy
        full_output_name = os.path.join(path_output, output_name)
        array2root(full_output, full_output_name, mode='recreate')
        logging.info('Output saved as : ' + full_output_name)
def writeOutPrediction(self, predicted, features, truth, weights, outfilename, inputfile):
    # predicted will be a list
    print('writeout')
    print('predicted', predicted[0].shape)
    print('features', features[0].shape)
    print('truth', truth[0].shape)

    def unroll(a):
        # unroll to event x vector
        a = np.reshape(a, [a.shape[0], a.shape[1] * a.shape[2], a.shape[3]])
        return a

    # first 100 are enough for now
    parr = predicted[0][:100, ...]  # unroll(predicted[0])
    farr = features[0][:100, ...]   # unroll(features[0])
    tarr = truth[0][:100, ...]      # unroll(truth[0])

    from DeepJetCore.TrainData import TrainData
    # use traindata as data storage
    td = TrainData()
    td._store([parr, farr, tarr], [], [])
    td.writeToFile(outfilename)
    return

    # Unreachable legacy path kept for reference (the original also carried a
    # stray unmatched triple quote here, which has been dropped):
    from root_numpy import array2root
    out = np.core.records.fromarrays(
        [parr[:, :, 0], parr[:, :, 1], parr[:, :, 2], parr[:, :, 3], parr[:, :, 4],
         parr[:, :, 5], parr[:, :, 6], parr[:, :, 7], parr[:, :, 9], parr[:, :, 10],
         tarr[:, :, 0], tarr[:, :, 1], tarr[:, :, 2], tarr[:, :, 3], tarr[:, :, 4],
         tarr[:, :, 5], tarr[:, :, 6], tarr[:, :, 7],
         farr[:, :, 0], farr[:, :, 1], farr[:, :, 2], farr[:, :, 3], farr[:, :, 4]],
        names='p_beta, p_posx, p_posy, p_ID0, p_ID1, p_ID2, p_dim1, p_dim2, p_ccoords1, p_coords2, '
              't_mask, t_posx, t_posy, t_ID0, t_ID1, tID_2, t_dim1, t_dim2, f_r, f_g, f_b, f_x, f_y')
    array2root(out, outfilename, 'tree')
def write_tree(self):
    array = []
    all_branches = default_branches + ["label", "train_id"] + self.additional_branches
    for aux in all_branches:
        print aux
        a = self.append_arrays(aux)
        if aux == "label" or aux == "train_id" or "dnn_score" in aux:
            tree_name = aux
        elif "evt_weight" in aux:
            tree_name = "weight"
        else:
            tree_name = aux[:-1]
        a_ = self.create_structured_array(a, tree_name)
        array.append(a_)
    for mva in self.mva_helpers.keys():
        a = [y for x in [self.mva_helpers[mva]["prediction"]["train"],
                         self.mva_helpers[mva]["prediction"]["test"],
                         self.mva_helpers[mva]["prediction"]["data"]] for y in x]
        a_ = self.create_structured_array(a, mva)
        array.append(a_)
    merged_array = rfn.merge_arrays(array, flatten=True, usemask=False)
    self.output_root = self.output + ".root"
    os.system("rm %s" % self.output_root)
    root_numpy.array2root(merged_array, self.output_root, treename="t")
def test_drop_nonscalar_columns():
    array = np.array([1, 2, 3])
    matrix = np.array([[1, 2, 3], [4, 5, 6]])
    bool_matrix = np.array([[True, False, True], [True, True, True]])

    dt = np.dtype([
        ('a', 'i4'),
        ('b', 'int64', array.shape),
        ('c', 'int64', matrix.shape),
        ('d', 'bool_'),
        ('e', 'bool_', matrix.shape)
    ])
    arr = np.array([
        (3, array, matrix, True, bool_matrix),
        (2, array, matrix, False, bool_matrix)],
        dtype=dt)

    path = 'tmp.root'
    array2root(arr, path, 'ntuple', mode='recreate')

    df = read_root(path, flatten=False)
    # the above line throws an error if flatten=True because nonscalar columns
    # are dropped only after the flattening is applied. However, the flattening
    # algorithm can not deal with arrays of more than one dimension.
    assert(len(df.columns) == 2)
    assert(np.all(df.index.values == np.array([0, 1])))
    assert(np.all(df.a.values == np.array([3, 2])))
    assert(np.all(df.d.values == np.array([True, False])))

    os.remove(path)
def test_nonscalar_columns():
    array = np.array([1, 2, 3], dtype=np.int64)
    matrix = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
    bool_matrix = np.array([[True, False, True], [True, True, True]], dtype=np.bool_)

    dt = np.dtype([
        ('a', 'i4'),
        ('b', 'int64', array.shape),
        ('c', 'int64', matrix.shape),
        ('d', 'bool_'),
        ('e', 'bool_', matrix.shape)
    ])
    arr = np.array([
        (3, array, matrix, True, bool_matrix),
        (2, array, matrix, False, bool_matrix)],
        dtype=dt)

    reference_df = pd.DataFrame()
    reference_df['a'] = np.array([3, 2], dtype=np.int32)
    reference_df['b'] = to_object_array([array, array])
    reference_df['c'] = to_object_array([matrix, matrix])
    reference_df['d'] = np.array([True, False], dtype=np.bool_)
    reference_df['e'] = to_object_array([bool_matrix, bool_matrix])

    path = 'tmp.root'
    array2root(arr, path, 'ntuple', mode='recreate')
    df = read_root(path, flatten=False)
    assert_frame_equal(df, reference_df)

    os.remove(path)
def to_root(df, path, tree_key="default", mode='w', *kargs, **kwargs):
    """
    Write DataFrame to a ROOT file.

    Parameters
    ----------
    path: string
        File path to new ROOT file (will be overwritten)
    tree_key: string
        Name of tree that the DataFrame will be saved as
    mode: string, {'w', 'a'}
        Mode that the file should be opened in (default: 'w')

    Notes
    -----
    Further *kargs and **kwargs are passed to root_numpy's array2root.

    >>> df = DataFrame({'x': [1,2,3], 'y': [4,5,6]})
    >>> df.to_root('test.root')

    The DataFrame index will be saved as a branch called 'index'.
    """
    if mode == 'a':
        mode = 'update'
    elif mode == 'w':
        mode = 'recreate'
    else:
        raise ValueError('Unknown mode: {}. Must be "a" or "w".'.format(mode))

    from root_numpy import array2root
    arr = df.to_records()
    array2root(arr, path, tree_key, mode=mode, *kargs, **kwargs)
def prepareOutput(outputD, ll, plotsD, rootFile, NN_MVA):
    NN_Output = h5py.File("%sNN_Output_applied_%s.h5" % (outputD, ll), "r+")
    mZ_x, mZ_y = NN_Output['MET_GroundTruth'][:, 0], NN_Output['MET_GroundTruth'][:, 1]
    a_x, a_y = NN_Output['MET_Predictions'][:, 0], NN_Output['MET_Predictions'][:, 1]
    mZ_r, mZ_phi = kar2pol(mZ_x, mZ_y)
    mZ_r = NN_Output['Boson_Pt'][:]
    a_r, a_phi = kar2pol(a_x, a_y)
    NN_LongZ = -np.cos(angularrange(np.add(a_phi, -mZ_phi))) * a_r
    NN_PerpZ = np.sin(angularrange(a_phi - mZ_phi)) * a_r

    # HDF5
    dset = NN_MVA.create_dataset("NN_LongZ", dtype='d', data=NN_LongZ)
    dset1 = NN_MVA.create_dataset("NN_PerpZ", dtype='d', data=NN_PerpZ)
    dset2 = NN_MVA.create_dataset("NN_Phi", dtype='d', data=a_phi)
    dset3 = NN_MVA.create_dataset("NN_Pt", dtype='d', data=a_r)
    dset4 = NN_MVA.create_dataset("Boson_Pt", dtype='d', data=mZ_r)
    dset5 = NN_MVA.create_dataset("NN_x", dtype='d', data=a_x)
    dset6 = NN_MVA.create_dataset("NN_y", dtype='d', data=a_y)
    dset7 = NN_MVA.create_dataset("Boson_x", dtype='d', data=mZ_x)
    dset8 = NN_MVA.create_dataset("Boson_y", dtype='d', data=mZ_y)
    dset9 = NN_MVA.create_dataset("Boson_Phi", dtype='d', data=mZ_phi)
    NN_MVA.close()

    # Root
    #treename = ll + "_nominal/ntuple"
    #Root_array = rnp.root2array(rootFile, treename=treename)
    #print("shape Root_array", Root_array.shape)
    NN_array = np.array([(a_r[i], a_phi[i], a_x[i], a_y[i], NN_LongZ[i], NN_PerpZ[i])
                         for i in range(len(a_r))],
                        dtype=[('NN_Pt', np.float32), ('NN_Phi', np.float32),
                               ('NN_x', np.float32), ('NN_y', np.float32),
                               ('NN_LongZ', np.float32), ('NN_PerpZ', np.float32)])
    print("shape NN_array", NN_array.shape)
    rnp.array2root(NN_array, "%s/NN_MVA_%s.root" % (outputD, ll), mode='recreate')  # was 'outputDir', an undefined name
def makePrediction(self, model, testdatacollection, outputDir, ident=''):
    import numpy as np
    from root_numpy import array2root
    import os

    outputDir = os.path.abspath(outputDir)
    if len(ident) > 0:
        ident = '_' + ident

    self.__sourceroots = []
    self.__predictroots = []
    self.metrics = []

    for i in range(len(testdatacollection.samples)):
        sample = testdatacollection.samples[i]
        originroot = testdatacollection.originRoots[i]
        outrootfilename = os.path.basename(originroot).split('.')[0] + '_predict' + ident + '.root'

        fullpath = testdatacollection.getSamplePath(sample)
        td = testdatacollection.dataclass
        td.readIn(fullpath)
        truthclasses = td.getUsedTruth()
        regressionclasses = td.regressiontargetclasses
        formatstring = ','.join(['prob_%s%s' % (i, ident) for i in truthclasses])

        features = td.x
        labels = td.y
        #metric = model.evaluate(features, labels, batch_size=10000)
        prediction = model.predict(features)
        if isinstance(prediction, list):  ######CHANGE FOR NEW FORMAT
            formatstring += ','
            formatstring += ','.join(['reg_%s%s' % (i, ident) for i in regressionclasses])
            all_write = np.concatenate(prediction, axis=1)
        elif prediction.shape[1] == len(truthclasses):
            all_write = prediction
        else:
            raise ValueError('Regression (2nd prediction output) can only have up to two values!')

        all_write = np.core.records.fromarrays(np.transpose(all_write), names=formatstring)
        array2root(all_write, outputDir + '/' + outrootfilename, "tree", mode="recreate")
        #self.metrics.append(metric)
        self.__sourceroots.append(originroot)
        self.__predictroots.append(outputDir + '/' + outrootfilename)
        print(formatstring)
        print('\ncreated prediction friend tree ' + outputDir + '/' + outrootfilename + ' for ' + originroot)
def getArrayToRoot(rec_array, foutname, treename):
    """
    Write a numpy record array out as a TTree in a ROOT file.

    Inputs: record array, output file name, output tree name
    """
    from root_numpy import array2root
    info('(getArrayToRoot) building tree %s in file %s' % (treename, foutname))
    array2root(rec_array, foutname, treename)
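# Illustrative call of the wrapper above (assumed usage, not from the
# original; it requires getArrayToRoot to be in scope): any NumPy record
# array can be handed to it directly. File and tree names are placeholders.
import numpy as np

rec = np.core.records.fromarrays([np.arange(5, dtype=np.float64)], names='x')
getArrayToRoot(rec, 'out.root', 'tree')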
def write_output_tree(allparticles, outputFile):
    from root_numpy import array2root
    out = np.core.records.fromarrays(
        allparticles.transpose(),
        names="is_reco, reco_posx, reco_posy, reco_e, is_true, true_posx, true_posy, true_e, true_id, n_true")
    array2root(out, outputFile + ".root", 'tree')
def test_array2tree_charstar():
    a = np.array([b'', b'a', b'ab', b'abc', b'xyz', b''], dtype=[('string', 'S3')])
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
def _saveAsROOT(self):
    output = self.data.to_records(index=False, column_dtypes='float64')
    output.dtype.names = [(name.replace('.', 'p').replace('(', '').replace(')', '')
                               .replace('-', '_minus_').replace('*', '_times_'))
                          for name in output.dtype.names]  # root_numpy issues
    array2root(output, self.save_path, mode='recreate')
    print('Output saved as : ' + self.save_path)
def writeOutPrediction(self, predicted, features, truth, weights, outfilename, inputfile):
    # predicted will be a list
    from root_numpy import array2root
    out = np.core.records.fromarrays(predicted[0].transpose(), names='prob_p, prob_np, prob_f')
    array2root(out, outfilename, 'tree')
def add_to_rootfile(rootfile, new_branch, branch_name=None, overwrite=True):
    """Adds a new branch to a given root file.

    .. warning:: Overwrite not working currently!

    Parameters
    ----------
    rootfile : root-dict
        The ROOT-file where the data should be added
    new_branch : numpy.array 1-D, list, root-dict
        A one-dimensional numpy array that contains the data.
    branch_name : str
        The name of the branch resp. the name in the dtype of the array.
    """
    # array2root was missing from the original import line
    from root_numpy import root2array, array2tree, array2root
    from rootpy.io import root_open

    rootfile = dev_tool.entries_to_str(rootfile)
    new_branch = dev_tool.entries_to_str(new_branch)
    branch_name = dev_tool.entries_to_str(branch_name)

    # get the right parameters
    # TODO: what does that if there? an assertion maybe?
    write_mode = 'update'
    branch_name = 'new_branch1' if branch_name is None else branch_name

    if isinstance(rootfile, dict):
        filename = rootfile.get('filenames')
        treename = rootfile.get('treename')
    new_branch = to_ndarray(new_branch)
    #new_branch.dtype = [(branch_name, 'f8')]

    # write to ROOT-file
    write_to_root = False
    if os.path.isfile(filename):
        with root_open(filename, mode='a') as root_file:
            tree = getattr(root_file, treename)  # test
            if not tree.has_branch(branch_name):
                write_to_root = True
        #array2tree(new_branch, tree=tree)
        #f.write("", TObject.kOverwrite)  # overwrite, does not create friends
    else:
        write_mode = 'recreate'
        write_to_root = True
    if write_to_root:
        arr = np.core.records.fromarrays([new_branch], names=branch_name)
        array2root(arr=arr, filename=filename, treename=treename, mode=write_mode)
        return 0
    else:
        return 1
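# A hedged usage sketch for add_to_rootfile (not from the original), following
# the root-dict layout the function expects ({'filenames': ..., 'treename': ...});
# the file, tree and branch names are placeholders.
import numpy as np

status = add_to_rootfile({'filenames': 'ntuple.root', 'treename': 'DecayTree'},
                         new_branch=np.random.normal(size=100),
                         branch_name='bdt_score')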
def getHits_adu(self, runID, ohdu, adu_keV, save=False):
    print('image2hit')
    path = self.outPath + self.partial_id
    simulatedImage_adu = self.getSimulatedImage_adu(adu_keV)
    if not self.empty:
        simulatedImage_adu += self.reconstructSCNImage_adu(runID, ohdu)
    hits_adu = self.convertImage2Hits(simulatedImage_adu, save=True)
    if save:
        root_numpy.array2root(hits_adu, path, treename='hitSumm', mode='recreate')
    return hits_adu
def save_file(data, pred, proba, filename, model):
    data['isSignal'] = pred
    print(filename)
    data['probSignal'] = proba[:]
    array2root(np.array(data.to_records()), 'OutputRoot/new_' + model + '_' + filename,
               'nominal', mode='recreate')
    print('Save file as {}'.format('new_' + model + '_' + filename))
    return
def test_to_root(folder, result_folder, output_root_folder, variables, is_signal, model_label, sample_list=[]):
    if not os.path.isdir(output_root_folder + '/model_' + model_label):
        os.mkdir(output_root_folder + '/model_' + model_label)

    if sample_list == []:
        print("   Empty sample list, will use full sample . . .")
        ##Read test sample
        store = pd.HDFStore(result_folder + 'test_score_' + model_label + '.h5')
        df_test = store.select("df")
        for n, a in enumerate(variables):  # was 'var', an undefined name
            back = np.array(df_test[a].loc[df_test[is_signal] == 0].values, dtype=[(a, np.float64)])
            sign = np.array(df_test[a].loc[df_test[is_signal] == 1].values, dtype=[(a, np.float64)])
            print(a, " back: ", back)
            print(a, " sign: ", sign)
            array2root(back, output_root_folder + '/model_' + model_label + '/test_bkg.root',
                       mode='recreate' if n == 0 else 'update')
            array2root(sign, output_root_folder + '/model_' + model_label + '/test_sgn.root',
                       mode='recreate' if n == 0 else 'update')
        print("   Signal and background root files written : ",
              output_root_folder + '/' + model_label + '/test_*.root')
    else:
        full_list = []
        for sl in sample_list:
            full_list += samples[sl]['files']
        for sample in full_list:
            ##Read test sample
            if not os.path.isfile(folder + sample + "_test.h5"):
                print("!!!File ", folder + sample + "_test.h5", " does not exist! Continuing")
                continue
            store = pd.HDFStore(result_folder + sample + "_score_" + model_label + ".h5")
            df_test = store.select("df")
            newFile = TFile(output_root_folder + '/model_' + model_label + '/' + sample + '.root', 'recreate')
            newFile.cd()
            for n, a in enumerate(variables):  # was 'var', an undefined name
                arr = np.array(df_test[a].values, dtype=[(a, np.float64)])
                #print(a, " values: ", arr)
                #array2root(arr, output_root_folder+'/model_'+model_label+'/'+sample+'.root', mode='recreate' if n==0 else 'update')
                if n == 0:
                    skim = array2tree(arr)
                else:
                    array2tree(arr, tree=skim)
            skim.Write()
            ##Recreate c_nEvents histogram
            counter = TH1F("c_nEvents", "Event Counter", 1, 0., 1.)
            counter.Sumw2()
            ##Fill counter histogram with the first entry of c_nEvents
            counter.Fill(0., df_test["c_nEvents"].values[0])
            ##print("counter bin content: ", counter.GetBinContent(1))
            counter.Write()
            newFile.Close()
            #counter.Delete()
            print("   Root file written : ", output_root_folder + '/model_' + model_label + '/' + sample + '.root')
def CreateTestSample(path, **kwargs):
    r"""Creates a :py:mod:`ROOT` file with toy data to be used for tests.

    The output file contains one tree with **nevents** number of entries
    represented by **nbranches** branches. Random numbers for each branch are
    drawn according to a chisquare distribution with a mean indicated by the
    branch index. The name of the output tree is given by **tree** and the
    branches are of the form 'branch_1', 'branch_2', ...

    Numbers are generated using the :class:`numpy.random` module and the
    output file is filled using the :func:`root_numpy.array2root` method. If a
    file with the same name already exists it will be overwritten (can be
    changed with the **overwrite** keyword argument). If **mkdir** is set to
    ``True`` (default: ``False``) directories in **path** which do not yet
    exist will be created automatically.

    :param path: path of output :py:mod:`ROOT` file
    :type path: ``str``

    :param \**kwargs: see below

    :Keyword Arguments:
        * **nevents** (``int``) -- number of events in the output tree (default: 10000)
        * **nbranches** (``int``) -- number of branches (default: 10)
        * **tree** (``str``) -- name of the output tree (default: 'tree')
        * **overwrite** (``bool``) -- overwrite an existing file located at **path** (default: ``True``)
        * **mkdir** (``bool``) -- create non-existing directories in **path** (default: ``False``)
    """
    basedir = os.path.dirname(os.path.abspath(path))
    if not os.path.exists(basedir):  # original tested 'if not basedir', which can never trigger
        logger.error("Directory '{}' does not exist!".format(basedir))
        raise IOError("Path not found!")
    nevents = int(kwargs.get("nevents", 1e4))
    nbranches = int(kwargs.get("nbranches", 10))
    treename = kwargs.get("tree", "tree")
    array = np.core.records.fromarrays(
        np.transpose(np.random.chisquare(range(1, nbranches + 1, 1), size=(nevents, nbranches))),
        names=",".join(["branch_{}".format(i + 1) for i in range(nbranches)]),
    )
    rnp.array2root(array, path, treename=treename, mode="recreate")
    if os.path.isfile(path):
        logger.info("Created '{}'.".format(path))
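# Hedged usage example for CreateTestSample (not from the original),
# exercising the documented keyword arguments; the output path is a placeholder.
CreateTestSample('toydata.root', nevents=1000, nbranches=3, tree='toytree')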
def save_file(data, pred, proba, filename, model):
    data['isSignal'] = pred
    print(filename)
    #for index in range(20):
    #    print "Proba {}".format(proba[index,0])
    data['probSignal'] = proba[:, 0]
    array2root(np.array(data.to_records()), 'OutputRoot/new_BDT_' + model + '_' + filename,
               'nominal', mode='recreate')
    return
def add_branch(arr, bname, rfile, tname):
    """
    Add the passed array to an existing TTree in an existing TFile

    Args:
        arr (numpy.array): 1D numpy array that will be stored under a new branch
        bname (str): Branch name for the values
        rfile (str): Filename to which the new branch should be added
        tname (str): Name of the TTree to which the values should be added
    """
    arr = np.array(arr, dtype=[(bname, np.find_common_type(arr, []))])
    array2root(arr, rfile, treename=tname, mode='update')
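# Illustrative call of add_branch above (an assumption, not from the
# original): the array is written into the existing tree via mode='update',
# so its length should match the entries already there. File, tree and
# branch names are placeholders.
import numpy as np

scores = np.random.uniform(size=1000)
add_branch(scores, 'bdt_score', 'ntuple.root', 'DecayTree')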
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool)])
    tmp_fd, tmp_path = tempfile.mkstemp(suffix='.root')
    rnp.array2root(a, tmp_path, mode='recreate')
    os.close(tmp_fd)
    os.remove(tmp_path)
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool)])
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
        # extend the tree
        rnp.array2root(a, tmp.GetName(), mode='update')
        a_conv2 = rnp.root2array(tmp.GetName())
        assert_array_equal(np.hstack([a, a]), a_conv2)
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool)])
    tmp_fd, tmp_path = tempfile.mkstemp(suffix='.root')
    rnp.array2root(a, tmp_path, mode='recreate')
    a_conv = rnp.root2array(tmp_path)
    assert_array_equal(a, a_conv)
    # extend the tree
    rnp.array2root(a, tmp_path, mode='update')
    a_conv2 = rnp.root2array(tmp_path)
    assert_array_equal(np.hstack([a, a]), a_conv2)
    os.close(tmp_fd)
    os.remove(tmp_path)
def to_root(df, path, key='default', mode='w', *args, **kwargs):
    """
    Write DataFrame to a ROOT file.

    Parameters
    ----------
    path: string
        File path to new ROOT file (will be overwritten)
    key: string
        Name of tree that the DataFrame will be saved as
    mode: string, {'w', 'a'}
        Mode that the file should be opened in (default: 'w')

    Notes
    -----
    Further *args and **kwargs are passed to root_numpy's array2root.

    >>> df = DataFrame({'x': [1,2,3], 'y': [4,5,6]})
    >>> df.to_root('test.root')

    The DataFrame index will be saved as a branch called '__index__*',
    where * is the name of the index in the original DataFrame
    """
    if mode == 'a':
        mode = 'update'
    elif mode == 'w':
        mode = 'recreate'
    else:
        raise ValueError('Unknown mode: {}. Must be "a" or "w".'.format(mode))

    from root_numpy import array2root
    # We don't want to modify the user's DataFrame here, so we make a shallow copy
    df_ = df.copy(deep=False)
    name = df_.index.name
    if name is None:
        # Handle the case where the index has no name
        name = ''
    df_['__index__' + name] = df_.index
    arr = df_.to_records(index=False)
    array2root(arr, path, key, mode=mode, *args, **kwargs)
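# Hedged round-trip sketch for the to_root above (not from the original): the
# '__index__*' branch carries the DataFrame index, so reading the file back
# recovers it. The file name follows the docstring example.
import pandas as pd
from root_numpy import root2array

df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
to_root(df, 'test.root')
arr = root2array('test.root', 'default')
print(arr.dtype.names)  # ('x', 'y', '__index__') for an unnamed index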
def test_array2root():
    a = np.array([
        (12345, 2., 2.1, True),
        (3, 4., 4.2, False),],
        dtype=[
            ('x', np.int32),
            ('y', np.float32),
            ('z', np.float64),
            ('w', np.bool)])
    with temp() as tmp:
        rnp.array2root(a, tmp.GetName(), mode='recreate')
        a_conv = rnp.root2array(tmp.GetName())
        assert_array_equal(a, a_conv)
        # extend the tree
        rnp.array2root(a, tmp.GetName(), mode='update')
        a_conv2 = rnp.root2array(tmp.GetName())
        assert_array_equal(np.hstack([a, a]), a_conv2)
        # write into subdirectory
        tname = 'root/sub/tree'
        rnp.array2root(a, tmp.GetName(), treename=tname, mode='update')
        a_conv3 = rnp.root2array(tmp.GetName(), treename=tname)
        assert_array_equal(a, a_conv3)
        # try creating tree with conflicting name
        assert_raises(IOError, rnp.array2root, a, tmp.GetName(),
                      treename='root/sub', mode='update')
        # try creating subdirectory with conflicting name
        assert_raises(IOError, rnp.array2root, a, tmp.GetName(),
                      treename='root/sub/tree/error', mode='update')
def write_output(self, outfile_name, ROOT=True, pickle_out=False):
    '''
    Converts outputed event data into re-usable data format, either ROOT or
    a pickled numpy record array

    Input:
    - outfile_name: string, name of outfile to write to (without extension)
    - ROOT: bool, True to write a ROOT file
    - pickle_out: bool, True to write a pickle file (renamed from 'pickle' in
      the original, where it shadowed the pickle module used below)
    '''
    # build record array of outputed event dicts
    # assume all dict's are the same structure
    dt = []
    for item in self.events[0]:
        if type(self.events[0][item]) == numpy.ndarray:
            dt += [(item, type(self.events[0][item][0]), len(self.events[0][item]))]
        else:
            dt += [(item, type(self.events[0][item]))]
    dt = numpy.dtype(dt)
    values = [tuple(each.values()) for each in self.events]
    out = numpy.zeros((len(self.events),), dtype=dt)
    out[:] = values
    # convert record array to root tree, write to file
    if ROOT:
        if self.logging:
            print "Creating file %s.root" % (outfile_name)
        root_numpy.array2root(out, '%s.root' % (outfile_name))
    # write record array to pickle file
    if pickle_out:
        if self.logging:
            print "Creating file %s.pickle" % (outfile_name)
        f = open('%s.pickle' % (outfile_name), 'w')
        pickle.dump(out, f)
        f.close()
print "GAMMA2 done..." root['kappa'] = bcc['KAPPA'] print "KAPPA done..." root['size'] = bcc['SIZE'] print "SIZE done..." root['eps1'] = bcc["EPSILON"][0:,0] print "EPSILON 1 done..." root['eps2'] = bcc["EPSILON"][0:,1] print "EPSILON 2 done..." root["mag"] = bcc["TMAG"][0:,2] print "TMAG done..." root["teps1"] = bcc["TE"][0:,0] print "TEPS1 done..." root["teps2"] =bcc["TE"][0:,1] print "TEPS2 done..." root["tra"] = bcc["TRA"] print "TRA done..." root["tdec"] = bcc["TDEC"] print "TDEC done..." root["tsize"] = bcc["TSIZE"] print "TSIZE done..." root["mu"] = bcc["MU"] print "All Done !" array2root(root,output,'bcc')
from root_numpy import root2array, rec2array, array2root

bdt_file = '/lustre/cmswork/hh/mvas/xgboost/train_3CSVM_0.5sig_0.7bkg_weighted.pkl'
branch_names = ["H1_pT", "H2_pT", "H1_dEta_abs", "H2_dEta_abs", "H1_dPhi_abs", "H2_dPhi_abs"]

# compute bdt values
bdt = joblib.load(bdt_file)
for root_file in args.root_files:
    print "processing {}".format(root_file)
    # load vars data from ROOT
    data = root2array(root_file, args.tree_name, branch_names)
    data_bdt = bdt.predict_proba(rec2array(data[branch_names]))[:, 1]
    # save to ROOT file
    data_bdt.dtype = [(args.bdt_name, np.float32)]
    array2root(data_bdt, root_file, "tree")
        range=(min_value, max_value), label='Signal SM', **hist_params)
    areaSig = sum(np.diff(bins)*values)
    #print areaBKG, " ", areaBKG2, " ", areaSig
    if n == 0:
        plt.legend(loc='best')
    plt.title(feature)
    plt.savefig("Variables_" + subset + BKG + "_benchmarks_" + ext)
    plt.clf()
"""
#################################################################################
### Define classifiers to test
traindataset, valdataset = train_test_split(dataset, random_state=11, train_size=0.50)
traindatasetmix, valdatasetmix = train_test_split(datasetmix, random_state=11, train_size=0.50)
#################################################################################
arr = valdatasetmix.to_records()
array2root(arr, outputCentral + "_AppliedToMixed" + typedata + ".root", 'tree', 'recreate')
arr = dataset.to_records()
array2root(arr, outputCentral + "_AppliedToPlain" + typedata + ".root", 'tree', 'recreate')
if typedata == "Data":
    arr = dataset20.to_records()
    array2root(arr, outputCentral + "_AppliedTo20pOfPlain" + typedata + ".root", 'tree', 'recreate')
#
for ii in range(0, 3):
    if ii == 0:
        train = trainFeaturesplot
        Var = 'All'
    if ii == 1:
        train = trainFeaturesObvious
        Var = 'Mass'
    if ii == 2:
import numpy as np
import root_numpy as rnp
import postprocessing.ttree
import ROOT

# Gets a TTree from a ROOT file.
signal_file = ROOT.TFile("ROOT_data/signal1MTraining.root")
signal_tree = signal_file.Get("h101;1")

# Converts the TTree to a NumPy structured array.
array = rnp.tree2array(signal_tree)

# Generates three new branches to be added to the array.
classifier_branch = postprocessing.ttree.name_classifier_branches(
    np.random.rand(247015, 3), ['ClassA', 'ClassB', 'ClassC'])

# Attaches the new branches to the original array.
together = postprocessing.ttree.join_struct_arrays([array, classifier_branch])

# Outputs the total array as a ROOT file.
rnp.array2root(together, "ROOT_data/together.root")
import sys
import numpy as np
from root_numpy import array2root

print sys.argv[1]

# need to find out how many columns are in the file
f = open(sys.argv[1])
l = f.readline()
colcount = l.count(',')
f.close()

cols = np.linspace(1, colcount, colcount, dtype=int)
data = np.genfromtxt(sys.argv[1], delimiter=',', names=True, usecols=cols)
array2root(data, sys.argv[1].replace('.csv', '.root'), 'outputTree')
fname, "ttree", Output_variables, None, 0, nfiles_per_sample, skip_n_events, False, "weight" ) Output_tree = rootnp.rec2array(Output_tree) Output_tree_final = np.ndarray( (Output_tree.shape[0],), dtype=[ ("Jet_flavour", float), ("TagVarCSV_vertexCategory", float), ("Jet_pt", float), ("Jet_eta", float), ("Jet_CSVIVF", float), ("BDTG", float), ], ) # , buffer = np.array([1,2,3,4,5])) for idx, val in enumerate(BDTG): Output_tree_final[idx][0] = Output_tree[idx][0] Output_tree_final[idx][1] = Output_tree[idx][1] Output_tree_final[idx][2] = Output_tree[idx][2] Output_tree_final[idx][3] = Output_tree[idx][3] Output_tree_final[idx][4] = Output_tree[idx][4] Output_tree_final[idx][5] = BDTG[idx] Output_tree_final = Output_tree_final.view(np.recarray) tree = rootnp.array2root( Output_tree_final, "trainPlusBDTG_CombinedSV" + category + "_" + flavor + ".root", "ttree", "recreate" ) log.info("Output file dumped in trainPlusBDTG_CombinedSV" + category + "_" + flavor + ".root") log.info("done")
if events[i]["event"]!=currentevent: if cutType=="bumphunt": candidates.sort(key=lambda x:events[x]["tarChisq"],reverse=False) elif cutType=="vertexing": candidates.sort(key=lambda x:events[x]["bscChisq"],reverse=False) elif cutType=="none": candidates.sort(key=lambda x:events[x]["tarChisq"],reverse=False) else: raise Exception("invalid cut type") # ranked_candidates = sorted(candidates, key=lambda x:events[x][sortkey],reverse=highestBest) rank=1 for j in candidates: output[j]["nPass"]=len(candidates) output[j]["rank"]=rank rank+=1 del candidates[:] currentevent = events[i]["event"] if output[i]["cut"]!=0: candidates.append(i) if cutOutput: output = output[output["cut"]!=0] if onlyBest: output = output[output["rank"]==1] if onlyOnly: output = output[output["nPass"]==1] root_numpy.array2root(output,remainder[0],mode="recreate",treename="cut") #newtree=root_numpy.array2tree(output) #newtree.Scan()
def savedata(dt, dt_LE, basename, clf=None, dt_real=None):
    # Get the data to write
    dt_out = dt.data
    labels = dt.treenames + dt.w_varnames + m_weightnames

    # Also take care of the low energy data
    dt_LE_out = dt_LE.data

    # If we have real data, save as well
    if dt_real is not None:
        dt_real_out = dt_real.data

    # if a classifier is passed, then add that to the data field
    if clf is not None:
        scores = clf.decision_function(dt.getDataNoWeight())
        scores = scores.reshape((len(scores), 1))
        dt_out = np.concatenate((dt_out, scores), axis=1)
        labels += ['score']
        LE_scores = clf.decision_function(dt_LE.getDataNoWeight())
        LE_scores = LE_scores.reshape((len(LE_scores), 1))
        dt_LE_out = np.concatenate((dt_LE_out, LE_scores), axis=1)
        if dt_real is not None:
            real_scores = clf.decision_function(dt_real.getDataNoWeight())
            real_scores = real_scores.reshape((len(real_scores), 1))
            dt_real_out = np.concatenate((dt_real_out, real_scores), 1)

    csl = ""
    for i in range(len(labels) - 1):
        csl += labels[i] + ","
    csl += labels[-1]

    # Separate the data into signal and background
    dt_out_sig = dt_out[dt.targets > 0.5]
    dt_out_bkg = dt_out[dt.targets < 0.5]

    # Turn into record array
    dt_out_sig = np.rec.fromrecords(dt_out_sig, names=csl)
    dt_out_bkg = np.rec.fromrecords(dt_out_bkg, names=csl)
    dt_LE_out = np.rec.fromrecords(dt_LE_out, names=csl)
    if dt_real is not None:
        dt_real_out = np.rec.fromrecords(dt_real_out, names=csl)

    for wn in m_weightnames:
        dt_out_sig[wn] *= dt.sf
        dt_out_bkg[wn] *= dt.sf
        dt_LE_out[wn] *= dt_LE.sf

    # Convert directly to a root file
    signame = 'processed_trees/' + basename + '_sig.root'
    bkgname = 'processed_trees/' + basename + '_bkg.root'
    dataname = 'processed_trees/' + basename + '_data.root'
    array2root(dt_out_sig, signame, 'tree', 'recreate')
    array2root(dt_out_bkg, bkgname, 'tree', 'recreate')

    # Put LE sig into the same file
    array2root(dt_LE_out, signame, 'tree')

    # Save real data if added
    if dt_real is not None:
        array2root(dt_real_out, dataname, 'tree', 'recreate')
def save_array(outputArray, outputName):
    #array = np.savetxt(outputName + ".txt", outputArray, fmt='%.4e', delimiter="|")
    outputString = str(outputName)
    logging.info("Creating .Root file")
    rnp.array2root(outputArray, outputString, treename='Training_Variables', mode='recreate')
# Convert to structured array
ntuple_list = df_ntuple.as_matrix().tolist()
ntuple_list = [tuple(a) for a in ntuple_list]
name_type_ref = [('BDT', 'f4'), ('Class', 'i4'), ('EventNumber', 'i4'),
                 ('EventWeight', 'f4'), ('MET', 'f4'), ('Mtop', 'f4'),
                 ('dPhiLBmin', 'f4'), ('dPhiVBB', 'f4'), ('dRBB', 'f4'),
                 ('dYWH', 'f4'), ('mBB', 'f4'), ('mBBJ', 'f4'),
                 ('mTW', 'f4'), ('nTags', 'i4'), ('nJ', 'i4'),
                 ('pTB1', 'f4'), ('pTB2', 'f4'), ('pTJ3', 'f4'),
                 ('pTV', 'f4'), ('sample', 'S15')]
ntuple_array = np.array(ntuple_list, dtype=name_type_ref)

# Write to ROOT file.
array2root(ntuple_array, '/Volumes/THUMB/VHbb-data/write/skl_BDT_results.root', mode='recreate')
t = f.NStations
nentries = t.GetEntriesFast()
cut = np.zeros(nentries, dtype='bool')
for i in range(nentries):
    t.GetEntry(i)
    rootID = '%s_%s_%s' % (t.Run, t.Event, t.SubEvent)
    if rootID in eventID:
        cut[i] = True
f.Close()

## WRITE TO FILE ##
# Most likely composition
try:
    values = np.zeros(nentries, dtype=[('comp', 'S1')])
    values[cut] = d['llh_comp'][:]
    root_numpy.array2root(values, outFile, 'llh_comp', 'recreate')
except ValueError:
    print 'Length mismatch. Skipping...'
    continue

# Most likely energy
values = np.zeros(nentries, dtype=[('energy', float)])
values[cut] = d['ML_energy'][:]
root_numpy.array2root(values, outFile, 'ML_energy')

# Likelihoods
keys = ['pLLH', 'hLLH', 'oLLH', 'fLLH']
for key in keys:
    values = np.zeros(nentries, dtype=[('llh', float)])
    values[cut] = d[key][:]
    root_numpy.array2root(values, outFile, key)
pool_files.append(extfile)
nfiles_per_sample = None
X_val = rootnp.root2array(extfile.path, 'tree', variables, None, 0, nfiles_per_sample, args.testEvery, False, 'weight')
X_val = rootnp.rec2array(X_val)
BDTG = clf_val.predict_proba(X_val)[:, 1]
if (args.TMVAOut):
    BDTG = [i * 2 - 1 for i in BDTG]
Output_variables = ['flavour', 'vertexCategory', 'jetPt', 'jetEta']
Output_tree = rootnp.root2array(extfile.path, 'tree', Output_variables, None, 0, nfiles_per_sample, args.testEvery, False, 'weight')
Output_tree = rootnp.rec2array(Output_tree)
Output_tree_final = np.ndarray((Output_tree.shape[0],),
                               dtype=[('flavour', float), ('vertexCategory', float),
                                      ('jetPt', float), ('jetEta', float),
                                      ('BDTG', float)])  # , buffer = np.array([1,2,3,4,5]))
for idx, val in enumerate(BDTG):
    Output_tree_final[idx][0] = Output_tree[idx][0]
    Output_tree_final[idx][1] = Output_tree[idx][1]
    Output_tree_final[idx][2] = Output_tree[idx][2]
    Output_tree_final[idx][3] = Output_tree[idx][3]
    Output_tree_final[idx][4] = BDTG[idx]
Output_tree_final = Output_tree_final.view(np.recarray)
outname = 'trainPlusBDTG_CombinedSV%s_%s.root' % (category, flavor) if not args.batch else \
    '%strainPlusBDTG_CombinedSV%s_%s.root' % (args.trainingTag, category, flavor)
outfile = os.path.join(dirpath, outname)
tree = rootnp.array2root(Output_tree_final, outfile, 'tree')
with io.root_open(outfile, 'update') as tout:
    tout.WriteTObject(watermark, 'watermark')
    tout.WriteTObject(codemark, 'codemark')
log.info('Output file dumped in %s' % outfile)
log.info('done')
def analyse(bdt_models, bdt_taggers, dnn_models, dnn_taggers, dnn_scaler, data_files):
    # using bdt_model
    #     bdt_model.predict_proba(data)
    # need to scale the data to use the dnn
    #     for i, v in enumerate(scaler.variables):
    #         data[v] = (data[v] - scaler.means[i]) / scaler.std[i]
    # This can be done on an event by event basis too
    #     event.variable = event.variable - means[i] / std[i]
    # using dnn_model
    #     dnn_model.predict(data)[0]

    # The variables have different names in data, so we need to map the variable
    # names to something the models can use / have the same names.
    bdt_var_map = {'Aplanarity': 'jetTrimX_aplanarity', 'EEC_C2_1': 'jetTrimX_c2beta1',
                   'EEC_D2_1': 'jetTrimX_d2beta1', 'Sphericity': 'jetTrimX_sphericity',
                   'SPLIT12': 'jetTrimX_groosplit12', 'TauWTA1': 'jetTrimX_grootau1',
                   'TauWTA2': 'jetTrimX_grootau2', 'TauWTA2TauWTA1': 'jetTrimX_grootau21',
                   'Mu12': 'jetTrimX_mufilt', 'yfilt': 'jetTrimX_ysfilt', 'y': 'jetTrimX_y',
                   'nTracks': 'jetTrimX_ungrngtrk', 'PlanarFlow': 'jetTrimX_planarflow'}
    # gotta get the order right for the variables
    bdt_vars_1 = []
    bdt_vars_2 = []
    for b in bdt_taggers:
        bdt_vars_1.append(bdt_var_map[b].replace('X', '1'))
        bdt_vars_2.append(bdt_var_map[b].replace('X', '2'))
    bdt_ivd = {}
    for k, v in bdt_var_map.items():
        bdt_ivd[v.replace('X', '1')] = k
        bdt_ivd[v.replace('X', '2')] = k

    dnn_var_map = {'aplanarity': 'jetTrimX_aplanarity', 'eec_c2_1': 'jetTrimX_c2beta1',
                   'eec_d2_1': 'jetTrimX_d2beta1', 'sphericity': 'jetTrimX_sphericity',
                   'split12': 'jetTrimX_groosplit12', 'tauwta1': 'jetTrimX_grootau1',
                   'tauwta2': 'jetTrimX_grootau2', 'tauwta2tauwta1': 'jetTrimX_grootau21',
                   'mu12': 'jetTrimX_mufilt', 'yfilt': 'jetTrimX_ysfilt', 'y': 'jetTrimX_y',
                   'ntracks': 'jetTrimX_ungrngtrk', 'planarflow': 'jetTrimX_planarflow'}
    # gotta get the order right for the variables
    dnn_vars_1 = []
    dnn_vars_2 = []
    print 'dnn_taggers'
    print dnn_taggers
    for b in dnn_taggers:
        dnn_vars_1.append(dnn_var_map[b].replace('X', '1'))
        dnn_vars_2.append(dnn_var_map[b].replace('X', '2'))
    dnn_ivd = {}
    for k, v in dnn_var_map.items():
        dnn_ivd[v.replace('X', '1')] = k
        dnn_ivd[v.replace('X', '2')] = k

    # read in the data so we can analyse it!
    # gotta do it one at a time for each data file :(
    for data_file in data_files:
        # benchmark and try a few options
        # first read in data using root2array
        print data_file
        data_arr = rn.root2rec(data_file)
        # unfortunately, some of the values in the ntuples are NaNs!!!
        # keep track of which ones these are...
        nan_idx_tmp = np.empty([0])
        no_values_tmp = np.empty([0])
        for d in data_arr.dtype.names:
            if d not in dnn_vars_1 and d not in dnn_vars_2:
                continue
            if np.any(np.isnan(data_arr[d])) or not np.all(np.isfinite(data_arr[d])):
                if len(nan_idx_tmp) == 0:
                    nan_idx_tmp = np.asarray(np.where(np.isnan(data_arr[d])))[0]
                else:
                    nan_idx_tmp = np.concatenate((nan_idx_tmp, np.asarray(np.where(np.isnan(data_arr[d])))[0]))
                data_arr[d] = np.nan_to_num(data_arr[d])
            if len(no_values_tmp) == 0:
                no_values_tmp = np.asarray(np.where(data_arr[d] < -98))[0]
            else:
                no_values_tmp = np.concatenate((no_values_tmp, np.asarray(np.where(data_arr[d] < -98))[0]))
        # concatenate
        nan_idx = np.unique(nan_idx_tmp)
        no_values = np.unique(no_values_tmp)

        data_arr['jetTrim1_groosplit12'] = data_arr['jetTrim1_groosplit12'] / 1000.
        data_arr['jetTrim2_groosplit12'] = data_arr['jetTrim2_groosplit12'] / 1000.

        # get the columns for classifying for jet1
        bdt_data = data_arr[bdt_vars_1]
        dnn_data = data_arr[dnn_vars_1]
        # get the columns for classifying for jet2
        bdt_data_2 = data_arr[bdt_vars_2]
        dnn_data_2 = data_arr[dnn_vars_2]
        # do we have to rename the data fields?
# easy enough to rename if the fields
recs = []
bdt_data.dtype.names = [bdt_ivd[d] for d in bdt_data.dtype.names]
bdt_data_2.dtype.names = [bdt_ivd[d] for d in bdt_data_2.dtype.names]
for d in bdt_data.dtype.names:
    if d != 'nTracks' and d.find('trk') == -1:
        recs.append((d, 'float'))
    else:
        recs.append((d, 'int'))
    #print np.any(np.isnan(bdt_data[d]))
    #print np.all(np.isfinite(bdt_data[d]))
    #print np.where(np.isnan(bdt_data[d]))

bdt_data_arr = bdt_data.view(np.float32).reshape(bdt_data.shape + (-1,))
bdt_data_arr_2 = bdt_data_2.view(np.float32).reshape(bdt_data_2.shape + (-1,))

bdt_proba = None
bdt_proba_2 = None
for m in bdt_models:
    if bdt_proba is not None:
        bdt_proba += m.predict_proba(bdt_data_arr)[:, 1]
        bdt_proba_2 += m.predict_proba(bdt_data_arr_2)[:, 1]
    else:
        bdt_proba = m.predict_proba(bdt_data_arr)[:, 1]
        bdt_proba_2 = m.predict_proba(bdt_data_arr_2)[:, 1]
# scale data
bdt_proba /= float(len(bdt_models))
bdt_proba_2 /= float(len(bdt_models))

for i, v in enumerate(dnn_scaler.variables):
    # reverse lookup for v
    #print v
    if v in dnn_var_map.keys():
        #print 'found v in dnn_var_map'
        #print dnn_var_map[v]
        if dnn_var_map[v].replace('X', '1') in dnn_data.dtype.names:
            '''
            print v
            print 'means and std'
            print dnn_scaler.means[i]
            print dnn_scaler.std[i]
            print np.mean(dnn_data[dnn_var_map[v].replace('X','1')])
            '''
            dnn_data[dnn_var_map[v].replace('X', '1')] = (dnn_data[dnn_var_map[v].replace('X', '1')] - dnn_scaler.means[i]) / dnn_scaler.std[i]
            dnn_data_2[dnn_var_map[v].replace('X', '2')] = (dnn_data_2[dnn_var_map[v].replace('X', '2')] - dnn_scaler.means[i]) / dnn_scaler.std[i]

dnn_data.dtype.names = [dnn_ivd[d] for d in dnn_data.dtype.names]
dnn_data_2.dtype.names = [dnn_ivd[d] for d in dnn_data_2.dtype.names]

# do we have to rename the data fields to get this to work?
dnn_predictions = None
for m in dnn_models:
    if dnn_predictions is not None:
        dnn_predict1 = m.predict(dnn_data)[0]
        dnn_predict1.dtype.names = ['jetTrim1_dnn']
        dnn_predict2 = m.predict(dnn_data_2)[0]
        dnn_predict2.dtype.names = ['jetTrim2_dnn']
        for n in xrange(len(dnn_predict1['jetTrim1_dnn'])):
            dnn_predictions['jetTrim1_dnn'][n] += dnn_predict1['jetTrim1_dnn'][n]
            dnn_predictions_2['jetTrim2_dnn'][n] += dnn_predict2['jetTrim2_dnn'][n]
    else:
        dnn_predictions = m.predict(dnn_data)[0]
        dnn_predictions.dtype.names = ['jetTrim1_dnn']
        dnn_predictions_2 = m.predict(dnn_data_2)[0]
        dnn_predictions_2.dtype.names = ['jetTrim2_dnn']

for n in xrange(len(dnn_predictions['jetTrim1_dnn'])):
    dnn_predictions['jetTrim1_dnn'][n] /= float(len(dnn_models))
    dnn_predictions_2['jetTrim2_dnn'][n] /= float(len(dnn_models))

# set all of the nan index ones to zero
for n in np.unique(np.concatenate((no_values, nan_idx))):  # nan_idx:
    dnn_predictions['jetTrim1_dnn'][n] = 0
    dnn_predictions_2['jetTrim2_dnn'][n] = 0
    bdt_proba[n] = 0
    bdt_proba_2[n] = 0

data_arr['jetTrim1_groosplit12'] = data_arr['jetTrim1_groosplit12'] * 1000.
data_arr['jetTrim2_groosplit12'] *= 1000.

# have to do this annoying .copy() to be able to add the dtype.names to any
# arrays that come from a slice.
#bdt_ = X.copy().view(dtype=[(n, np.float64) for n in variables]).reshape(len(X))

# now add them to the data file and write it out
data_scored = nf.append_fields(data_arr,
                               names=['jetTrim1_bdt', 'jetTrim2_bdt', 'jetTrim1_dnn', 'jetTrim2_dnn'],
                               data=[bdt_proba, bdt_proba_2, dnn_predictions, dnn_predictions_2],
                               usemask=False)
rn.array2root(data_scored, data_file.replace('.root', '_scored_nonTrk_avg_vTest.root'), 'dibjet', 'recreate')
def table_to_root(table, filename, **kwargs):
    """Write a Table to a ROOT file
    """
    import root_numpy
    root_numpy.array2root(table.as_array(), filename, **kwargs)
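# Hedged usage sketch for table_to_root above (not from the original),
# assuming an astropy Table, whose as_array() returns the structured array
# that array2root expects; column, file and tree names are placeholders.
from astropy.table import Table

t = Table({'energy': [1.0, 2.0, 3.0], 'flux': [0.1, 0.2, 0.3]})
table_to_root(t, 'table.root', treename='events', mode='recreate')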