def saveClassAttributes(clss, form, save_addr):
    """ Save class attributes.

    **Parameters**

    clss : instance
        Handle of the instance to be saved.
    form : str
        Format to save in ('h5'/'hdf5', 'mat', or 'dmp'/'dump').
    save_addr : str
        The address to save the attributes in.
    """

    save_addr = u.appendformat(save_addr, form)

    if form == 'mat':
        sio.savemat(save_addr, clss.__dict__)

    elif form in ('h5', 'hdf5'):
        try:
            dictdump.dicttoh5(clss.__dict__, save_addr)
        except:
            # fall back to deepdish if dicttoh5 cannot handle the attribute dict
            dio.save(save_addr, clss.__dict__, compression=None)

    elif form in ('dmp', 'dump'):
        with open(save_addr, 'wb') as fh:
            pickle.dump(clss, fh)

    else:
        raise NotImplementedError
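# The h5/hdf5 branch above ultimately relies on deepdish writing a plain dict to HDF5.
# A minimal, self-contained sketch of that round trip (the file name is illustrative):
import numpy as np
import deepdish.io as dio

dio.save('attributes_demo.h5', {'counts': np.arange(5), 'label': 'demo'})
restored = dio.load('attributes_demo.h5')
assert np.array_equal(restored['counts'], np.arange(5))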
def saveClassAttributes(clss, form, save_addr):
    """ Save class attributes.

    :Parameters:
        clss : instance
            Handle of the instance to be saved.
        form : str
            Format to save in ('h5' or 'mat').
        save_addr : str
            The address to save the attributes in.
    """

    save_addr = u.appendformat(save_addr, form)

    if form == 'mat':
        sio.savemat(save_addr, clss.__dict__)

    elif form in ('h5', 'hdf5'):
        try:
            dictdump.dicttoh5(clss.__dict__, save_addr)
        except:
            dio.save(save_addr, clss.__dict__, compression=None)

    else:
        raise NotImplementedError
def test(net, data):
    '''
    Args:
    -----
        net: a trained keras Model instance
        data: an OrderedDict containing all X, y, w ndarrays for all particles
            (both train and test), e.g.:
            data = {
                "X_jet_train" : X_jet_train,
                "X_jet_test" : X_jet_test,
                "X_photon_train" : X_photon_train,
                "X_photon_test" : X_photon_test,
                "y_train" : y_train,
                "y_test" : y_test,
                "w_train" : w_train,
                "w_test" : w_test
            }
    Returns:
    --------
        yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data
    '''
    yhat = net.predict(
        [data['X_jet_test'], data['X_photon_test'], data['X_event_test']],
        verbose=True, batch_size=512)
    io.save(open(os.path.join('output', 'yhat.h5'), 'wb'), yhat)
    # -- example of other quantities that can be evaluated from yhat
    # class_predictions = [np.argmax(ev) for ev in yhat]
    return yhat
def save_pde_sol(self, fname, sol, t=None):
    b, w = self.split_sol(sol)
    data = {}
    data['b'] = b
    data['w'] = w
    data['Ps'] = self.p
    data['Es'] = self.setup
    if t is not None:
        data['t'] = t
    dd.save(fname + '.hdf5', data, compression='blosc')
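# Reading one of these solution files back is symmetric; a short sketch assuming
# save_pde_sol() above was called with fname='pde_run' (an illustrative name):
import os
import deepdish.io as dd

if os.path.isfile('pde_run.hdf5'):
    data = dd.load('pde_run.hdf5')
    b, w = data['b'], data['w']          # the two saved fields
    params, setup = data['Ps'], data['Es']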
def save(self, path):
    """
    Saves an instance of the class using :func:`deepdish.io.save`.

    Parameters
    ----------
    path : str
        Output path to HDF5 file.
    """
    io.save(path, self.save_to_dict())
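# A minimal sketch of the matching loader, assuming the class also provides a
# `load_from_dict` counterpart to `save_to_dict` (class and method names here are
# illustrative, not taken from the snippet above):
import deepdish.io as io

class Saveable(object):
    def __init__(self, data):
        self.data = data

    def save_to_dict(self):
        return {'data': self.data}

    @classmethod
    def load_from_dict(cls, d):
        return cls(d['data'])

    def save(self, path):
        io.save(path, self.save_to_dict())

    @classmethod
    def load(cls, path):
        # deepdish returns the same dict structure that save() wrote
        return cls.load_from_dict(io.load(path))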
def saveParmSet(fname, par, text=None, saveroot=False):
    from deepdish.io import save
    if saveroot:
        from tlmModel import tlmModel, Es_normal
        m = tlmModel(Es=Es_normal, Ps=par, Vs=None)
        t, result = m.ode_integrate([0.9, 0.2, 0.2])
        b, w, h = result.T[-1]
        par['b'] = b
        par['w'] = w
        par['h'] = h
    if text is not None:
        par['text'] = text
    save("./auto/" + fname + ".hdf5", par)
def test():
    '''
    '''
    data = io.load(open('test_data.h5', 'rb'))
    #data = remove_tau(data)

    # -- Load scikit classifier
    classifier = joblib.load('sklBDT_trk2.pkl')

    # -- Get classifier predictions
    yhat = classifier.predict_proba(data['X'])[:, 2]

    io.save(open('yhat_test.h5', 'wb'), yhat)
def saveParmSet(fname, par, text=None, saveroot=False):
    from deepdish.io import save
    if saveroot:
        from tlmModel import tlmModel, Es_normal
        Es_normal['verbose'] = False
        m = tlmModel(Es=Es_normal, Ps=par, Vs=None)
        t, result = m.ode_integrate([0.9, 0.3, 0.3])
        b, s1, s2 = result[-1]
        par['b'] = b
        par['s1'] = s1
        par['s2'] = s2
        print b, s1, s2
    if text is not None:
        par['text'] = text
    save("./auto/" + fname + ".hdf5", par)
def pinv(x):
    # This could be a numpy issue, since the same matrix works fine in Matlab.
    # https://github.com/numpy/numpy/issues/1588
    try:
        res = np.linalg.pinv(x)
    except np.linalg.LinAlgError as err:
        n_retry = 0
        while True:
            try:
                # retry with a tiny random perturbation of the matrix
                res = np.linalg.pinv(x + np.random.randn(*x.shape) * 1e-15)
                break
            except np.linalg.LinAlgError:
                n_retry += 1
                if n_retry == 3:
                    from deepdish import io as dio
                    dio.save('debug.hdf', dict(x=x))
                    raise err
    return res
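# If the retries above are exhausted, the offending matrix ends up in 'debug.hdf';
# a quick way to inspect it afterwards (a sketch that only runs if the dump exists):
import os
import numpy as np
from deepdish import io as dio

if os.path.isfile('debug.hdf'):
    x = dio.load('debug.hdf')['x']
    print(x.shape, np.linalg.cond(x))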
def main():
    opts = parse_options()
    inFile = opts.inputFile
    tree = opts.treeName
    df = root2pandas(inFile, tree)

    # -- save a pandas df to hdf5 (better to first convert it back to ndarray, to be fair)
    import deepdish.io as io
    outFile = inFile.replace(".root", ".h5")
    io.save(outFile, df)

    # -- let's load it back in to make sure it actually worked!
    new_df = io.load(outFile)

    # -- check the shape again -- nice check to run every time you create a df
    print "File check!"
    print "(Number of events, Number of branches): ", new_df.shape
def main(MODEL_FILE): print "Loading hdf5's..." test_dict = io.load('./data/test_dict_IPConv_ntuple_'+ RUN_NAME +'.h5') train_dict = io.load('./data/train_dict_IPConv_ntuple_'+ RUN_NAME +'.h5') X_train = train_dict['X'] y_train = train_dict['y'] X_test = test_dict['X'] y_test = test_dict['y'] n_features = X_test.shape[2] # this is a df ip3d = test_dict['ip3d'] print 'Building model...' if (MODEL_FILE == 'CRNN'): graph = build_graph(n_features) model = Sequential() model.add(graph) # remove Maxout for tensorflow model.add(MaxoutDense(64, 5, input_shape=graph.nodes['dropout'].output_shape[1:])) model.add(Dense(64)) elif (MODEL_FILE == 'RNN'): model = Sequential() model.add(Masking(mask_value=-999, input_shape=(N_TRACKS, n_features))) model.add(GRU(25))#, input_shape=(N_TRACKS, n_features))) #GRU model.add(Dropout(0.2)) #0.2 # remove Maxout for tensorflow model.add(MaxoutDense(64, 5)) #, input_shape=graph.nodes['dropout'].output_shape[1:])) model.add(Dense(64)) model.add(Dropout(0.4)) model.add(Highway(activation = 'relu')) model.add(Dropout(0.3)) model.add(Dense(4)) model.add(Activation('softmax')) print 'Compiling model...' model.compile('adam', 'categorical_crossentropy') model.summary() print 'Training:' try: model.fit(X_train, y_train, batch_size=512, callbacks = [ EarlyStopping(verbose=True, patience=20, monitor='val_loss'), ModelCheckpoint(MODEL_FILE + RUN_NAME +'-progress', monitor='val_loss', verbose=True, save_best_only=True) ], nb_epoch=100, validation_split = 0.2, show_accuracy=True) except KeyboardInterrupt: print 'Training ended early.' # -- load in best network model.load_weights(MODEL_FILE + RUN_NAME +'-progress') if (SAVE_PROTOBUF): print 'Saving protobuf' # write out to a new directory called models # the actual graph file is graph.pb # the graph def is in the global session import tensorflow as tf import keras.backend.tensorflow_backend as tfbe sess = tfbe._SESSION saver = tf.train.Saver() tf.train.write_graph(sess.graph_def, 'models/', 'graph.pb', as_text=False) save_path = saver.save(sess, "./model-weights.ckpt") print "Model saved in file: %s" % save_path print saver.as_saver_def().filename_tensor_name print saver.as_saver_def().restore_op_name print model.get_output() print 'Saving weights...' model.save_weights('./weights/ip3d-replacement_' + MODEL_FILE + RUN_NAME +'.h5', overwrite=True) json_string = model.to_json() open(MODEL_FILE + RUN_NAME +'.json', 'w').write(json_string) print 'Testing...' yhat = model.predict(X_test, verbose = True, batch_size = 512) io.save('yhat'+ RUN_NAME +'.h5', yhat) print 'Plotting ROC...' fg = plot_ROC(y_test, yhat, ip3d, MODEL_FILE) #plt.show() fg.savefig('./plots/roc' + MODEL_FILE + RUN_NAME +'.pdf')
def save_hdf_from_nc(fname):
    from deepdish.io import save
    data = conv_nc_to_dict(fname)
    if fname.endswith('.nc'):
        fname = fname[:-3]
    save(fname + '.hdf5', data)
def makeCombi(inputDir, inputFile, outputDir, makeTrainingInput=False, sys=''): print(str(inputDir+"/"+inputFile)+" start") chain = TChain("ttbbLepJets/tree") chain.Add(inputDir+"/"+inputFile) data = False if 'Data' in inputDir: data = True ttbb = False if 'ttbb' in inputDir: ttbb = True if makeTrainingInput: ttbb = True muon_ch = 0 muon_pt = 30.0 muon_eta = 2.1 electron_ch = 1 electron_pt = 35.0 electron_eta = 2.1 jet_pt = 30.0 jet_eta = 2.4 jet_CSV_tight = 0.9535 jetCombi = [] for i in xrange(chain.GetEntries()) : chain.GetEntry(i) lepton_SF = 1.0 jet_SF_CSV = 1.0 pdfweight = [] scaleweight = [] PUWeight = [] lepton_SF = [] jet_SF_CSV_30 = [] if not data: for j in xrange((chain.lepton_SF).size()): lepton_SF.append(float(chain.lepton_SF[j])) for j in xrange((chain.jet_SF_CSV_30).size()): jet_SF_CSV_30.append(float(chain.jet_SF_CSV_30[j])) for j in xrange((chain.PUWeight).size()): PUWeight.append(float(chain.PUWeight[j])) if 'TT' in inputDir or 'tt' in inputDir: for j in xrange((chain.scaleweight).size()): scaleweight.append(float(chain.scaleweight[j])) for j in xrange((chain.pdfweight).size()): pdfweight.append(float(chain.pdfweight[j])) MET_px = chain.MET*math.cos(chain.MET_phi) MET_py = chain.MET*math.sin(chain.MET_phi) nu = TLorentzVector(MET_px, MET_py, 0, chain.MET) lep = TLorentzVector() lep.SetPtEtaPhiE(chain.lepton_pT, chain.lepton_eta, chain.lepton_phi, chain.lepton_E) passmuon = False passelectron = False passmuon = chain.channel == muon_ch and lep.Pt() > muon_pt and abs(lep.Eta()) < muon_eta passelectron = chain.channel == electron_ch and lep.Pt() > electron_pt and abs(lep.Eta()) < electron_eta if passmuon == False and passelectron == False: continue addbjet1 = TLorentzVector(0,0,0,0) addbjet2 = TLorentzVector(0,0,0,0) if ttbb: addbjet1.SetPtEtaPhiE(chain.addbjet1_pt,chain.addbjet1_eta,chain.addbjet1_phi,chain.addbjet1_e) addbjet2.SetPtEtaPhiE(chain.addbjet2_pt,chain.addbjet2_eta,chain.addbjet2_phi,chain.addbjet2_e) njets = 0 nbjets = 0 addbjet1_matched = TLorentzVector(0,0,0,0) addbjet2_matched = TLorentzVector(0,0,0,0) for iJet in range(len(chain.jet_pT)): jet = TLorentzVector() jet.SetPtEtaPhiE(chain.jet_pT[iJet],chain.jet_eta[iJet],chain.jet_phi[iJet],chain.jet_E[iJet]) if not data : if 'jecup' in sys: jet *= chain.jet_JER_Nom[iJet] * chain.jet_JES_Up[iJet] elif 'jecdown' in sys: jet *= chain.jet_JER_Nom[iJet] * chain.jet_JES_Down[iJet] elif 'jerup' in sys: jet *= chain.jet_JER_Up[iJet] elif 'jerdown' in sys: jet *= chain.jet_JER_Down[iJet] else: jet *= chain.jet_JER_Nom[iJet] if jet.Pt() < jet_pt or abs(jet.Eta()) > jet_eta: continue njets += 1 if chain.jet_CSV[iJet] > jet_CSV_tight: nbjets += 1 if addbjet1.DeltaR(jet) < 0.4: addbjet1_matched = jet; if addbjet2.DeltaR(jet) < 0.4: addbjet2_matched = jet; if njets < 6 or nbjets < 3: continue print("addbjet1: "+str(addbjet1.Pt())+" matched: "+str(addbjet1_matched.Pt())) print("addbjet2: "+str(addbjet2.Pt())+" matched: "+str(addbjet2_matched.Pt())) for j in range(len(chain.jet_pT)-1): for k in range(j+1, len(chain.jet_pT)): if chain.jet_CSV[j] > jet_CSV_tight and chain.jet_CSV[k] > jet_CSV_tight: b1 = TLorentzVector() b2 = TLorentzVector() b1.SetPtEtaPhiE(chain.jet_pT[j], chain.jet_eta[j], chain.jet_phi[j], chain.jet_E[j]) b2.SetPtEtaPhiE(chain.jet_pT[k], chain.jet_eta[k], chain.jet_phi[k], chain.jet_E[k]) if not data : if 'jecup' in sys: b1 *= chain.jet_JER_Nom[j] * chain.jet_JES_Up[j] b2 *= chain.jet_JER_Nom[k] * chain.jet_JES_Up[k] elif 'jecdown' in sys: b1 *= chain.jet_JER_Nom[j] * chain.jet_JES_Down[j] b2 *= 
chain.jet_JER_Nom[k] * chain.jet_JES_Down[k] elif 'jerup' in sys: b1 *= chain.jet_JER_Up[j] b2 *= chain.jet_JER_Up[k] elif 'jerdown' in sys: b1 *= chain.jet_JER_Down[j] b2 *= chain.jet_JER_Down[k] else : b1 *= chain.jet_JER_Nom[j] b2 *= chain.jet_JER_Nom[k] if makeTrainingInput: if (addbjet1_matched.DeltaR(b1) == 0 and addbjet2_matched.DeltaR(b2) == 0) or (addbjet2_matched.DeltaR(b1) == 0 and addbjet1_matched.DeltaR(b2) == 0): signal = 1 else: signal = 0 jetCombi.append([ signal,i,b1.DeltaR(b2),abs(b1.Eta()-b2.Eta()),b1.DeltaPhi(b2), (b1+b2+nu).Pt(),(b1+b2+nu).Eta(),(b1+b2+nu).Phi(),(b1+b2+nu).M(), (b1+b2+lep).Pt(),(b1+b2+lep).Eta(),(b1+b2+lep).Phi(),(b1+b2+lep).M(), (b1+lep).Pt(),(b1+lep).Eta(),(b1+lep).Phi(),(b1+lep).M(), (b2+lep).Pt(),(b2+lep).Eta(),(b2+lep).Phi(),(b2+lep).M(), (b1+b2).Pt(),(b1+b2).Eta(),(b1+b2).Phi(),(b1+b2).M(), chain.jet_CSV[j],chain.jet_CSV[k], b1.Pt(),b2.Pt(),b1.Eta(),b2.Eta(),b1.Phi(),b2.Phi(),b1.E(),b2.E() ]) else: jetCombi.append([ #Tree info i, chain.channel, njets, nbjets, chain.genweight, PUWeight, lepton_SF, jet_SF_CSV_30, scaleweight, pdfweight, lep.Pt(), lep.Eta(), lep.Phi(), lep.E(), addbjet1.Pt(), addbjet1.Eta(), addbjet1.Phi(), addbjet1.E(), addbjet2.Pt(), addbjet2.Eta(), addbjet2.Phi(), addbjet2.E(), j,k, #Deep learning variables b1.DeltaR(b2),abs(b1.Eta()-b2.Eta()),b1.DeltaPhi(b2), (b1+b2+nu).Pt(),(b1+b2+nu).Eta(),(b1+b2+nu).Phi(),(b1+b2+nu).M(), (b1+b2+lep).Pt(),(b1+b2+lep).Eta(),(b1+b2+lep).Phi(),(b1+b2+lep).M(), (b1+lep).Pt(),(b1+lep).Eta(),(b1+lep).Phi(),(b1+lep).M(), (b2+lep).Pt(),(b2+lep).Eta(),(b2+lep).Phi(),(b2+lep).M(), (b1+b2).Pt(),(b1+b2).Eta(),(b1+b2).Phi(),(b1+b2).M(), chain.jet_CSV[j],chain.jet_CSV[k], b1.Pt(),b2.Pt(),b1.Eta(),b2.Eta(),b1.Phi(),b2.Phi(),b1.E(),b2.E() ]) if makeTrainingInput: combi = pd.DataFrame(jetCombi, columns=['signal', 'event']+ut.getVarlist()) else: combi = pd.DataFrame(jetCombi, columns= ['event','channel','njets','nbjets', 'genWeight','PUWeight', 'lepton_SF','jet_SF_CSV_30', 'scaleweight', 'pdfweight', 'leptonPt','leptonEta','leptonPhi','leptonE', 'addbjet1_pt','addbjet1_eta','addbjet1_phi','addbjet1_e', 'addbjet2_pt','addbjet2_eta','addbjet2_phi','addbjet2_e', 'b1','b2', ] + ut.getVarlist()) tmp = inputFile[:-5] if makeTrainingInput: io.save(outputDir+"/array_train_ttbb.h5",combi) else: io.save(outputDir+"/array_"+tmp+".h5",combi) print(str(inputDir+"/"+inputFile)+" end")
def _fit_and_score_ckpt(workdir=None, checkpoint=True, force_refresh=False, **fit_and_score_kwargs): """Fit estimator and compute scores for a given dataset split. This function wraps :func:`sklearn:sklearn.model_selection._validation._fit_and_score`, while also saving checkpoint files containing the estimator, paramters, This is useful if fitting and scoring is costly or if it is being performed within a large cross-validation experiment. In avoid collisions with scores computed for other CV splits, this function computes a hash from a nested dictionary containing all keyword arguments as well as estimator parameters. It then saves the scores and parameters in <hash>_params.h5 and the estimator itself in <hash>_estimator.pkl Parameters ---------- workdir : path-like object, default=None A string or :term:`python:path-like-object` indicating the directory in which to store checkpoint files checkpoint : bool, default=True If True, checkpoint the parameters, estimators, and scores. force_refresh : bool, default=False If True, recompute scores even if the checkpoint file already exists. Otherwise, load scores from checkpoint files and return. **fit_and_score_kwargs : kwargs Key-word arguments passed to :func:`sklearn:sklearn.model_selection._validation._fit_and_score` Returns ------- train_scores : dict of scorer name -> float Score on training set (for all the scorers), returned only if `return_train_score` is `True`. test_scores : dict of scorer name -> float Score on testing set (for all the scorers). n_test_samples : int Number of test samples. fit_time : float Time spent for fitting in seconds. score_time : float Time spent for scoring in seconds. parameters : dict or None The parameters that have been evaluated. estimator : estimator object The fitted estimator """ if not checkpoint: return _fit_and_score(**fit_and_score_kwargs) if workdir is None: raise ValueError( "If checkpoint is True, you must supply a working directory " "through the ``workdir`` argument.") estimator = fit_and_score_kwargs.pop("estimator", None) estimator_params = _serialize_estimator_params(estimator.get_params()) all_params = { "estimator_params": estimator_params, "fit_and_score_kwargs": fit_and_score_kwargs, } cv_hash = hashlib.md5( json.dumps(all_params, sort_keys=True, ensure_ascii=True, default=str).encode()).hexdigest() h5_file = os.path.join(workdir, cv_hash + "_params.h5") pkl_file = os.path.join(workdir, cv_hash + "_estimator.pkl") if not force_refresh and os.path.exists(h5_file): ckpt_dict = ddio.load(h5_file) scores = ckpt_dict["scores"] if fit_and_score_kwargs.get("return_estimator", False): with open(pkl_file, "rb") as fp: estimator = pickle.load(fp) scores.append(estimator) return scores else: scores = _fit_and_score(estimator, **fit_and_score_kwargs) os.makedirs(workdir, exist_ok=True) if fit_and_score_kwargs.get("return_estimator", False): estimator = scores[-1] with open(pkl_file, "wb") as fp: pickle.dump(estimator, fp) ckpt_scores = scores[:-1] if isinstance(estimator, Pipeline): model = estimator.steps[-1] else: model = estimator estimator_params = _serialize_estimator_params( estimator.get_params()) fitted_params = { "alpha_": getattr(model, "alpha_", None), "alphas_": getattr(model, "alphas_", None), "l1_ratio_": getattr(model, "l1_ratio_", None), "mse_path_": getattr(model, "mse_path_", None), "scoring_path_": getattr(model, "scoring_path_", None), "intercept_": getattr(model, "intercept_", None), "coef_": getattr(model, "coef_", None), } else: estimator_params = None fitted_params = None 
ckpt_scores = scores fit_and_score_kwargs.pop("X") fit_and_score_kwargs.pop("y") if "scorer" in fit_and_score_kwargs: fit_and_score_kwargs["scorer"] = list( fit_and_score_kwargs["scorer"].keys()) ckpt_dict = { "scores": ckpt_scores, "fit_and_score_kwargs": fit_and_score_kwargs, "estimator_params": estimator_params, "fitted_params": fitted_params, } ddio.save(h5_file, ckpt_dict) return scores
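# Stand-alone illustration of the checkpoint naming scheme used in _fit_and_score_ckpt
# above: the file name is an md5 digest of the JSON-serialized parameter dictionary.
# This is a sketch with toy values; the real call hashes the estimator parameters plus
# all fit/score keyword arguments.
import hashlib
import json
import os

all_params = {
    "estimator_params": {"alpha": 0.1, "l1_ratio": 0.5},
    "fit_and_score_kwargs": {"train": [0, 1, 2], "test": [3, 4]},
}
cv_hash = hashlib.md5(
    json.dumps(all_params, sort_keys=True, ensure_ascii=True, default=str).encode()
).hexdigest()
print(os.path.join("workdir", cv_hash + "_params.h5"))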
def main(embed_size, normed, input_id, run_name): configure_logging() logger = logging.getLogger("RNNIP Training") logger.info("Loading hdf5's") test_dict = io.load(os.path.join('data', 'test_dict_' + input_id + '.h5')) train_dict = io.load(os.path.join('data', 'train_dict_' + input_id + '.h5')) X_train_stream0 = train_dict['grade'] X_train_stream1 = train_dict['X'] y_train = train_dict['y'] X_test_stream0 = test_dict['grade'] X_test_stream1 = test_dict['X'] y_test = test_dict['y'] ip3d = test_dict['ip3d'] logger.info('Building model') model = build_model(X_train_stream0, X_train_stream1, embed_size, normed) model.summary() logger.info('Compiling model') model.compile('adam', 'categorical_crossentropy', metrics=['accuracy']) #-- if the pre-trained model exists, load it in, otherwise start from scratch safe_mkdir('weights') weights_file = os.path.join('weights', 'rnnip_' + run_name + '.h5') try: model.load_weights(weights_file) logger.info('Loaded pre-trained model from ' + weights_file) except IOError: logger.info('No pre-trained model found in ' + weights_file) logger.info('Training:') try: model.fit([X_train_stream0, X_train_stream1], y_train, batch_size=512, callbacks=[ EarlyStopping(verbose=True, patience=20, monitor='val_loss'), ModelCheckpoint(weights_file, monitor='val_loss', verbose=True, save_best_only=True) ], epochs=300, validation_split=0.2) except KeyboardInterrupt: logger.info('Training ended early.') # -- load in best network logger.info('Loading best epoch') model.load_weights(weights_file) json_string = model.to_json() safe_mkdir('json_models') open(os.path.join('json_models', run_name + '.json'), 'w').write(json_string) logger.info('Testing') safe_mkdir('predictions') yhat = model.predict([X_test_stream0, X_test_stream1], verbose=True, batch_size=10000) io.save(os.path.join('predictions', 'yhat' + run_name + '.h5'), yhat) logger.info('Plotting ROC') plot_ROC(y_test, yhat, ip3d, run_name)
    #'jet_trk_dPhi']              # more to be added in `process_data`
    'jet_trk_phi']                # more to be added in `process_data`

cut_vars = ['jet_eta', 'jet_pt', 'jet_JVT', 'jet_aliveAfterOR']  # only necessary to remove bad jets

# -- load and process training set
print 'Loading training dataframe...'
trk_train = pup.root2panda(
    './data/Dan/NOtrkSel/train_NOtrkSel.root',
    'bTag_AntiKt4EMTopoJets',
    branches=track_inputs + cut_vars + ['jet_LabDr_HadF', 'jet_ip3d_pu',
                                        'jet_ip3d_pb', 'jet_ip3d_pc',
                                        'jet_phi', 'jet_trk_theta'])

print 'Processing training sample ...'
train_dict = process_data(trk_train, cut_vars, savevars=True)
del trk_train
io.save('./data/train_dict_IPConv_ntuple_MyTrkSel.h5', train_dict)

# -- load and process test set
print 'Loading test dataframe...'
trk_test = pup.root2panda(
    './data/Dan/NOtrkSel/test/user.dguest.8493098.Akt4EMTo._000013_NOtrkSel.root',
    'bTag_AntiKt4EMTopoJets',
    branches=track_inputs + cut_vars + ['jet_LabDr_HadF', 'jet_ip3d_pu',
                                        'jet_ip3d_pb', 'jet_ip3d_pc',
                                        'jet_phi', 'jet_trk_theta'])

print 'Processing test sample...'
test_dict = process_data(trk_test, cut_vars, savevars=False)
del trk_test
io.save('./data/test_dict_IPConv_ntuple_MyTrkSel.h5', test_dict)
help="Maximum number of tracks per event. \ If the event has fewer tracks, use padding; if is has more, only consider the first ntrk" ) args = parser.parse_args() print 'Loading dataframes...' # -- currently only training and testing on one file each! trk_train = pup.root2panda(os.path.join('data', 'train', args.train_files), 'bTag_AntiKt4EMTopoJets', branches=track_inputs + jet_inputs) trk_test = pup.root2panda(os.path.join('data', 'test', args.test_files), 'bTag_AntiKt4EMTopoJets', branches=track_inputs + jet_inputs) print 'Processing training sample ...' train_dict = process_data(trk_train, jet_inputs, args.ntrk, args.sort_by, args.output, savevars=True) del trk_train io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'), train_dict) print 'Processing test sample...' test_dict = process_data(trk_test, jet_inputs, args.ntrk, args.sort_by, args.output) del trk_test io.save(os.path.join('data', 'test_dict_' + args.output + '.h5'), test_dict)
def process(i, filepath, yaml_file): ''' ''' import pandautils as pup branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features( yaml_file) logger = logging.getLogger("ETL Service") logger.info('Operating on {}'.format(filepath)) logger.info('Creating dataframe...') df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches) logger.info('Transforming variables...') df = transformVars(df) logger.info('Flattening df...') df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True) df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()}) del df logger.info('Applying cuts...') df_flat = apply_calojet_cuts(df_flat) logger.info('Creating X, y, w, mv2c10...') y = df_flat['jet_LabDr_HadF'].values mv2c10 = df_flat['jet_mv2c10'].values jet_pt = df_flat['jet_pt'].values ip3d_vars = df_flat[ip3d_training_vars].values ipmp_vars = df_flat[ipmp_training_vars].values # -- slice df by only keeping the training variables X = df_flat[training_vars].values # -- Find weights by reweighting to the light distribution # -- TO DO: pass the pt and eta columns directly, instead of passing their indices pt_col = np.argwhere(np.array(training_vars) == 'jet_pt')[0][0] eta_col = np.argwhere(np.array(training_vars) == 'abs(jet_eta)')[0][0] #w = reweight_to_b(X, y, pt_col, eta_col) w = reweight_to_l(X, y, pt_col, eta_col) del df_flat logger.info('Shuffling, splitting, scaling...') ix = np.array(range(len(y))) X_train, X_test, y_train, y_test, w_train, w_test, ix_train, ix_test, \ mv2c10_train, mv2c10_test, jet_pt_train, jet_pt_test, \ ip3d_vars_train, ip3d_vars_test, ipmp_vars_train, ipmp_vars_test = train_test_split( X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars, train_size=0.6) scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) ip3d_vars_train = scaler.fit_transform(ip3d_vars_train) ip3d_vars_test = scaler.transform(ip3d_vars_test) ipmp_vars_train = scaler.fit_transform(ipmp_vars_train) ipmp_vars_test = scaler.transform(ipmp_vars_test) X_train, X_validate, y_train, y_validate, w_train, w_validate, ix_train, ix_validate, \ mv2c10_train, mv2c10_validate, jet_pt_train, jet_pt_validate, \ ip3d_vars_train, ip3d_vars_validate, ipmp_vars_train, ipmp_vars_validate = train_test_split( X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train, ip3d_vars_train, ipmp_vars_train, train_size=0.7) train = { 'X': X_train, 'ip3d_vars': ip3d_vars_train, 'ipmp_vars': ipmp_vars_train, 'y': y_train, 'w': w_train, 'ix': ix_train, 'mv2c10': mv2c10_train, 'pt': jet_pt_train } test = { 'X': X_test, 'ip3d_vars': ip3d_vars_test, 'ipmp_vars': ipmp_vars_test, 'y': y_test, 'w': w_test, 'ix': ix_test, 'mv2c10': mv2c10_test, 'pt': jet_pt_test } validate = { 'X': X_validate, 'ip3d_vars': ip3d_vars_validate, 'ipmp_vars': ipmp_vars_validate, 'y': y_validate, 'w': w_validate, 'ix': ix_validate, 'mv2c10': mv2c10_validate, 'pt': jet_pt_validate } logger.info('Saving dictionaries to hdf5...') hdf5_train_path = os.path.join('..', 'data', 'DL1-' + OUTNAME + str(i) + '-train-db.h5') hdf5_test_path = os.path.join('..', 'data', 'DL1-' + OUTNAME + str(i) + '-test-db.h5') hdf5_validate_path = os.path.join( '..', 'data', 'DL1-' + OUTNAME + str(i) + '-validate-db.h5') io.save(hdf5_train_path, train) io.save(hdf5_test_path, test) io.save(hdf5_validate_path, validate) logger.debug('Saved hdf5 archives: {}, {}, {}'.format( hdf5_train_path, hdf5_test_path, hdf5_validate_path)) return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
def save(self, fname):
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=tables.NaturalNameWarning)
        dio.save(fname, self.to_dict())
def integrate(prec_i, chi, beta, Ps=defaultPs,
              n=(512, 512), l=(256.0, 256.0),
              Vs_initial="uniform", rhs="oz_EQK",
              bc="periodic", it="pseudo_spectral",
              first_time=1000.0, tol=1.0e-8, add_noise=0.01,
              fname="cont", verbose=True,
              create_movie=False, send_email=None, test=False):
    import deepdish.io as dd
    if send_email is not None:
        import getpass
        pswd = getpass.getpass('Password:')
    Es = {'rhs': rhs, 'n': n, 'l': l, 'bc': bc, 'it': it,
          'dt': 0.1, 'verbose': verbose, 'analyze': False, 'setPDE': True}
    if Vs_initial.endswith(".dat", -4):
        state = np.loadtxt(Vs_initial)
        m = bwhModel(Es=Es, Ps=Ps, Vs=state)
        fname = str(Vs_initial[:-4])
    else:
        m = bwhModel(Es=Es, Ps=Ps, Vs=None)
        if Vs_initial == "uniform":
            m.setup_initial_condition("uniform", p=prec_i, chi=chi, beta=beta)
        else:
            m.setup_initial_condition(Vs_initial)
    m.setup['verbose'] = verbose
    yr = m.p['conv_T_to_t']
    if test:
        print("Test session:")
        print(m.p)
        print("Dimpar:", m.p['dimpar'])
        print("chi=", chi, "beta=", beta)
        print("fname:", fname, ", fname type:", type(fname))
        # sol = Vs_initial
    if not test:
        print("****Starting Integration****")
        sol = m.integrate(m.initial_state, check_convergence=True,
                          max_time=first_time * yr, p=prec_i, chi=chi, beta=beta)
        print("****Integration Finished****")
    elif test:
        sol = m.initial_state
    b, w, h = m.split_state(sol)
    dd.save(fname + ".hdf5", {'p': prec_i, 'chi': float(chi), 'beta': float(beta),
                              'Ps_dimensional': m.p['dimpar'],
                              'n': n, 'l': l, 'state': sol, 'test': test,
                              'b': b, 'w': w, 'h': h})
    if send_email is not None:
        try:
            import smtplib
            from socket import gaierror
            server = smtplib.SMTP('smtp.gmail.com', 587)
            server.ehlo()
            server.starttls()
            server.login(send_email, pswd)
            msg = "\r\n".join([
                "From: {}".format(send_email),
                "To: {}".format(send_email),
                "Subject: Drought bwh simulations finished",
                "",
                "Drought bwh simulations finished and saved to:",
                fname
            ])
            server.sendmail(send_email, send_email, msg)
            server.quit()
        except gaierror:
            pass
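# The state archive written by integrate() is a plain deepdish file, so results can be
# inspected directly; a hedged sketch assuming a previous run used the default fname="cont":
import os
import deepdish.io as dd

if os.path.isfile("cont.hdf5"):
    out = dd.load("cont.hdf5")
    print(out['p'], out['chi'], out['beta'], out['b'].shape)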
def main(file_name, run_name, n_tracks): print "Loading hdf5's from ./data/test_dict" + file_name + ".h5 and ./data/train_dict" + file_name + ".h5" test_dict = io.load(os.path.join('data', 'test_dict_' + file_name + '.h5')) train_dict = io.load(os.path.join('data', 'train_dict_' + file_name + '.h5')) X_train = train_dict['X'] y_train = train_dict['y'] X_test = test_dict['X'] y_test = test_dict['y'] n_features = X_test.shape[2] # this is a df ip3d = test_dict['ip3d'] print 'Building model...' # -- for track grade as a normal input: model = Sequential() model.add(Masking(mask_value=-999, input_shape=(n_tracks, n_features))) model.add(Dropout(0.2)) # dropping out before the GRU should help us reduce dependence on any specific input variable # ^^ could be desirable when training on track hits in case one detector layer fails model.add(GRU(25, return_sequences=False)) # model.add(Dropout(0.2)) model.add(Dense(4)) model.add(Activation('softmax')) model.summary() print 'Compiling model...' model.compile('adam', 'categorical_crossentropy', metrics=["accuracy"]) # -- if the pre-trained model exists, load it in, otherwise start from scratch try: _weights_location = os.path.join('weights', 'ip3d-replacement_' + MODEL_FILE + '_' + run_name +'.h5') model.load_weights(_weights_location) print 'Loaded pre-trained model from ' + _weights_location except IOError: print 'No pre-trained model found in ' + _weights_location print 'Training:' try: model.fit(X_train, y_train, batch_size=1024, callbacks = [ EarlyStopping(verbose=True, patience=20, monitor='val_loss'), ModelCheckpoint(MODEL_FILE + run_name +'-progress', monitor='val_loss', verbose=True, save_best_only=True) ], nb_epoch=300, validation_split = 0.2) except KeyboardInterrupt: print 'Training ended early.' # -- load in best network model.load_weights(MODEL_FILE + run_name +'-progress') print 'Saving weights in ' + _weights_location model.save_weights(_weights_location, overwrite=True) json_string = model.to_json() open(MODEL_FILE + run_name +'.json', 'w').write(json_string) print 'Testing...' yhat = model.predict(X_test, verbose = True, batch_size = 1024) io.save('yhat'+ run_name +'.h5', yhat) print 'Plotting ROC...' fg_bl, fg_bc = plot_ROC(y_test, yhat, ip3d, run_name, MODEL_FILE)
def simdrought(prec_i, prec_f, delta_p, delta_year, chi, beta, Ps=defaultPs,
               n=(512, 512), l=(256.0, 256.0),
               Vs_initial="uniform", rhs="oz_EQK",
               bc="periodic", it="pseudo_spectral",
               first_time=100.0, tol=1.0e-8, add_noise=0.01,
               fname="cont", verbose=True,
               savefile=None, create_movie=False,
               send_email=None):
    import deepdish.io as dd
    if send_email is not None:
        import getpass
        pswd = getpass.getpass('Password:')
    Es = {'rhs': rhs, 'n': n, 'l': l, 'bc': bc, 'it': it,
          'dt': 0.1, 'verbose': verbose, 'analyze': False, 'setPDE': True}
    if type(Vs_initial) == str:
        fname = fname + "_" + Vs_initial
    prec_gradient_down = np.arange(prec_i, prec_f - delta_p, -delta_p)
    time_span = np.arange(delta_year,
                          len(prec_gradient_down) * delta_year + delta_year,
                          delta_year)
    m = bwhModel(Vs=None, Es=Es, Ps=Ps)
    if Vs_initial == "uniform":
        m.setup_initial_condition("uniform", p=prec_i, chi=chi, beta=beta)
    else:
        m.setup_initial_condition(Vs_initial)
    m.setup['verbose'] = verbose
    yr = m.p['conv_T_to_t']
    # Converging on the first solution using integration and then root
    Vs_init = m.integrate(m.initial_state, check_convergence=True,
                          max_time=first_time * yr, p=prec_i, chi=chi, beta=beta)
    # Es['rhs']="oz_EQK_relax"
    # m = bwhModel(Vs=Vs_initial,Es=Es,Ps=Ps)
    # m.setup['verbose']=verbose
    Vs = Vs_init.copy()
    b_sol = np.zeros((len(prec_gradient_down), n[0], n[1]))
    w_sol = np.zeros((len(prec_gradient_down), n[0], n[1]))
    h_sol = np.zeros((len(prec_gradient_down), n[0], n[1]))
    if create_movie and savefile is not None:
        savefile_base = savefile
    for i, prec in enumerate(prec_gradient_down):
        print("Integration for p =", prec)
        if create_movie and savefile is not None:
            savefile = savefile_base + "_p{:4.3f}".format(prec).replace(".", "_")
        b, w, h = m.split_state(Vs)
        if add_noise is not None:
            b = b + add_noise * np.random.random(size=b.shape)
            w = w + add_noise * np.random.random(size=w.shape)
            # h = h + add_noise*np.random.random(size=h.shape)
        Vs = np.ravel((b, w, h))
        Vs_new = m.integrate(initial_state=Vs,
                             max_time=delta_year * yr, step=yr,
                             check_convergence=False,
                             savefile=savefile, create_movie=False,
                             p=prec, chi=chi, beta=beta)
        if rhs != "oz_EQK":
            if m.converged_relaxation == False:
                time, result = m.pseudo_spectral_integrate(initial_state=Vs,
                                                           finish=delta_year * yr,
                                                           step=yr,
                                                           p=prec, chi=chi, beta=beta)
                Vs_new = result[-1]
        b, w, h = m.split_state(Vs_new)
        b_sol[i] = b
        w_sol[i] = w
        h_sol[i] = h
        Vs = np.ravel((b, w, h))
    dd.save(fname + ".hdf5", {'p': prec_gradient_down, "T": time_span,
                              'chi': chi, 'beta': beta,
                              'Ps_dimensional': m.p['dimpar'],
                              'n': n, 'l': l,
                              'b': b_sol,
                              'w': w_sol,
                              'h': h_sol})
    if send_email is not None:
        try:
            import smtplib
            from socket import gaierror
            server = smtplib.SMTP('smtp.gmail.com', 587)
            server.ehlo()
            server.starttls()
            server.login(send_email, pswd)
            msg = "\r\n".join([
                "From: {}".format(send_email),
                "To: {}".format(send_email),
                "Subject: Drought bwh simulations finished",
                "",
                "Drought bwh simulations finished and saved to:",
                fname
            ])
            server.sendmail(send_email, send_email, msg)
            server.quit()
        except gaierror:
            pass
    'jet_trk_nsplitPixHits', 'jet_trk_nSCTHits',
    'jet_trk_nsharedSCTHits', 'jet_trk_expectBLayerHit']  # 2 more to be added in `process_data`

print 'Loading dataframes...'
# -- currently only training and testing on one file each!
trk_train = pup.root2panda(
    './data/train/*410000_00*.root', 'JetCollection',
    branches=track_inputs + ['jet_truthflav', 'jet_ip3d_pu', 'jet_ip3d_pb',
                             'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi'])
trk_test = pup.root2panda(
    './data/test/*410000*.root', 'JetCollection',
    branches=track_inputs + ['jet_truthflav', 'jet_ip3d_pu', 'jet_ip3d_pb',
                             'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi'])

print 'Processing training sample ...'
train_dict = process_data(trk_train, savevars=True)
del trk_train
io.save('./data/train_dict_IPConv.h5', train_dict)

print 'Processing test sample...'
test_dict = process_data(trk_test)
del trk_test
io.save('./data/test_dict_IPConv.h5', test_dict)
def save(self, path):
    d = self.save_to_dict()
    d["name"] = self.name
    io.save(path, d)
        If the event has fewer tracks, use padding; if it has more, only consider the first ntrk")
parser.add_argument('--inputs', default='grade', help='one of: hits, grade')
args = parser.parse_args()

track_inputs, jet_inputs = generate_inputlist(args.inputs)

print 'Loading dataframes...'
# -- currently only training and testing on one file each!
trk_train = pup.root2panda(
    os.path.join('data', 'train', args.train_files),
    'bTag_AntiKt4EMTopoJets',
    branches=track_inputs + jet_inputs)
trk_test = pup.root2panda(
    os.path.join('data', 'test', args.test_files),
    'bTag_AntiKt4EMTopoJets',
    branches=track_inputs + jet_inputs)

print 'Processing training sample ...'
train_dict = process_data(trk_train, jet_inputs, args.ntrk, args.sort_by,
                          args.output, args.inputs, savevars=True)
del trk_train
io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'), train_dict)

print 'Processing test sample...'
test_dict = process_data(trk_test, jet_inputs, args.ntrk, args.sort_by,
                         args.output, args.inputs)
del trk_test
io.save(os.path.join('data', 'test_dict_' + args.output + '.h5'), test_dict)
""" return ( self.cells[tup] for tup in self.cells if self.cells[tup] != -1 and self.cell_distance(cell, tup) ) def update(self, point, index): """updates the grid with the new point Parameters ---------- point : index : Returns ------- """ self.cells[self.cellify(point)] = index def __str__(self): return self.cells.__str__() if __name__ == "__main__": bg = 255 - poisson_disk_background((640, 640), 12, 2) dio.save("poisson_dense.h5", bg)
def main(inputfiles, treename, ftrain, max_n_pairs, exclude_list): ''' Args: ----- inputfiles: list of strings with the paths to root files treename: string, name of the TTree that contains the branches ftrain: float in range [0, 1], training fraction max_n_pairs: int, maximum number of jet pairs to consider per event exclude_list: Returns: -------- ''' # -- configure logging utils.configure_logging() logger = logging.getLogger('main') # -- concatenate all files into a pandas df short_filenames = [f.split('/')[-1] for f in inputfiles] logger.info('Creating pandas dataframes from: {}'.format(short_filenames)) #df = pd.concat([pup.root2panda(f, treename) for f in inputfiles], ignore_index=True) df_list = [] for f in inputfiles: df_temp = pup.root2panda(f, treename) df_temp['sample'] = f.split('/')[-1].split('.')[0] df_list.append(df_temp) df = pd.concat(df_list, ignore_index=True) # -- remove events with more than one correct jet pair # -- because that shouldn't happen and complicates the task # -- of finding the correct jet pair logger.info('Removing events with more than one correct jet pair') keep = np.array([sum(yen) for yen in df['isCorrect'].values]) <= 1 df = df[keep].reset_index(drop=True) # -- target logger.info('Building one-hot target') y = df['isCorrect'].values # -- extract array of names of sample of origin sample = df['sample'].values # -- prepend 1 to all entries in y where there is no correct jet pair, # -- 0 if there exists a correct jet pair already # -- each entry in y will now have length (n_jet_pairs + 1) y_long = np.array([ np.insert(yev, 0, 1) if sum(yev) == 0 else np.insert(yev, 0, 0) for yev in y ]) # -- weights logger.info('Extracting weights from event_weight') w = df['event_weight'].values del df['event_weight'], df['isCorrect'], df['sample'] df = df.drop(exclude_list, axis=1) # maybe in the future do something # better with these variables instead of just removing them # -- matrix of predictors X = df.values ix = range(X.shape[0]) varlist = df.columns.values.tolist() # -- maximum number of jet pairs to consider in each event # -- can be set to whatever number makes sense #max_length = max([len(b) for b in df['Delta_eta_jb']]) + 1 max_length = max_n_pairs + 1 logger.info( 'The max number of jet pairs per event will be {}'.format(max_n_pairs)) X_train, X_test, y_train, y_test, w_train, w_test,\ sample_train, sample_test, ix_train, ix_test, scaler_list = shuffle_split_scale_pad( X, y_long, w, sample, ix, ftrain, max_length ) logger.info('Saving processed data as hdf5 in data/') io.save( os.path.join('data', 'train_dict.hdf5'), { 'X': X_train, 'y': y_train, 'w': w_train, 'ix': ix_train, 'vars': varlist, 'sample': sample_train.tolist(), 'scalers': scaler_list }) io.save( os.path.join('data', 'test_dict.hdf5'), { 'X': X_test, 'y': y_test, 'w': w_test, 'ix': ix_test, 'vars': varlist, 'sample': sample_test.tolist(), 'scalers': scaler_list })
def write_current_state(self): """ Write the current state of the sampler to disk. The required information to reconstruct the state of the run are written to an hdf5 file. All but the most recent removed live point in the chain are removed from the sampler to reduce memory usage. This means it is necessary to not append the first live point to the file if updating a previous checkpoint. Parameters ---------- sampler: `dynesty.NestedSampler` NestedSampler to write to disk. """ check_directory_exists_and_if_not_mkdir(self.outdir) resume_file = '{}/{}_resume.h5'.format(self.outdir, self.label) if os.path.isfile(resume_file): saved = load(resume_file) current_state = dict( unit_cube_samples=np.vstack( [saved['unit_cube_samples'], self.sampler.saved_u[1:]]), physical_samples=np.vstack( [saved['physical_samples'], self.sampler.saved_v[1:]]), sample_likelihoods=np.concatenate( [saved['sample_likelihoods'], self.sampler.saved_logl[1:]]), sample_log_volume=np.concatenate([ saved['sample_log_volume'], self.sampler.saved_logvol[1:] ]), sample_log_weights=np.concatenate([ saved['sample_log_weights'], self.sampler.saved_logwt[1:] ]), cumulative_log_evidence=np.concatenate([ saved['cumulative_log_evidence'], self.sampler.saved_logz[1:] ]), cumulative_log_evidence_error=np.concatenate([ saved['cumulative_log_evidence_error'], self.sampler.saved_logzvar[1:] ]), cumulative_information=np.concatenate([ saved['cumulative_information'], self.sampler.saved_h[1:] ]), id=np.concatenate([saved['id'], self.sampler.saved_id[1:]]), it=np.concatenate([saved['it'], self.sampler.saved_it[1:]]), nc=np.concatenate([saved['nc'], self.sampler.saved_nc[1:]]), boundidx=np.concatenate( [saved['boundidx'], self.sampler.saved_boundidx[1:]]), bounditer=np.concatenate( [saved['bounditer'], self.sampler.saved_bounditer[1:]]), scale=np.concatenate( [saved['scale'], self.sampler.saved_scale[1:]]), ) else: current_state = dict( unit_cube_samples=self.sampler.saved_u, physical_samples=self.sampler.saved_v, sample_likelihoods=self.sampler.saved_logl, sample_log_volume=self.sampler.saved_logvol, sample_log_weights=self.sampler.saved_logwt, cumulative_log_evidence=self.sampler.saved_logz, cumulative_log_evidence_error=self.sampler.saved_logzvar, cumulative_information=self.sampler.saved_h, id=self.sampler.saved_id, it=self.sampler.saved_it, nc=self.sampler.saved_nc, boundidx=self.sampler.saved_boundidx, bounditer=self.sampler.saved_bounditer, scale=self.sampler.saved_scale, ) current_state.update(ncall=self.sampler.ncall, live_logl=self.sampler.live_logl, iteration=self.sampler.it - 1, live_u=self.sampler.live_u, live_v=self.sampler.live_v, nlive=self.sampler.nlive, live_bound=self.sampler.live_bound, live_it=self.sampler.live_it, added_live=self.sampler.added_live) weights = np.exp(current_state['sample_log_weights'] - current_state['cumulative_log_evidence'][-1]) current_state[ 'posterior'] = self.external_sampler.utils.resample_equal( np.array(current_state['physical_samples']), weights) save(resume_file, current_state) self.sampler.saved_id = [self.sampler.saved_id[-1]] self.sampler.saved_u = [self.sampler.saved_u[-1]] self.sampler.saved_v = [self.sampler.saved_v[-1]] self.sampler.saved_logl = [self.sampler.saved_logl[-1]] self.sampler.saved_logvol = [self.sampler.saved_logvol[-1]] self.sampler.saved_logwt = [self.sampler.saved_logwt[-1]] self.sampler.saved_logz = [self.sampler.saved_logz[-1]] self.sampler.saved_logzvar = [self.sampler.saved_logzvar[-1]] self.sampler.saved_h = [self.sampler.saved_h[-1]] self.sampler.saved_nc = 
[self.sampler.saved_nc[-1]] self.sampler.saved_boundidx = [self.sampler.saved_boundidx[-1]] self.sampler.saved_it = [self.sampler.saved_it[-1]] self.sampler.saved_bounditer = [self.sampler.saved_bounditer[-1]] self.sampler.saved_scale = [self.sampler.saved_scale[-1]]
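# The resume file written by write_current_state() above is an ordinary deepdish
# archive, so a checkpoint can be inspected directly; a hedged sketch (outdir and
# label are illustrative placeholders for the sampler's actual settings):
import os
from deepdish.io import load

resume_file = 'outdir/label_resume.h5'
if os.path.isfile(resume_file):
    saved = load(resume_file)
    print(saved['iteration'], saved['nlive'], len(saved['sample_likelihoods']))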
def process(i, filepath, yaml_file, model_id): ''' ''' import pandautils as pup # -- load branches from yaml file branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features(yaml_file) logger = logging.getLogger("ETL Service") # -- load root file to dataframe logger.info('Operating on {}'.format(filepath)) logger.info('Creating dataframe...') df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches) # -- create MV2 input quantities, set default values logger.info('Transforming variables...') df = transformVars(df) # -- flatten to jet-flat structure logger.info('Flattening df...') df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True) df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()}) del df # --apply standard cuts on AntiKT4EMTopoJets logger.info('Applying cuts...') df_flat = apply_calojet_cuts(df_flat) # -- create numpy arrays for ML logger.info('Creating X, y, w, mv2c10...') y = df_flat['jet_LabDr_HadF'].values mv2c10 = df_flat['jet_mv2c10'].values jet_pt = df_flat['jet_pt'].values ip3d_vars = df_flat[ip3d_training_vars].values ipmp_vars = df_flat[ipmp_training_vars].values # -- slice df by only keeping the training variables X = df_flat[training_vars].values # -- Find weights by reweighting to the light distribution pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values w = reweight_to_l(pteta, y, pt_col=0, eta_col=1) del df_flat, pteta # -- shuffle data, split into train and test logger.info('Shuffling, splitting, scaling...') ix = np.array(range(len(y))) X_train, X_test,\ y_train, y_test,\ w_train, w_test,\ ix_train, ix_test, \ mv2c10_train, mv2c10_test,\ jet_pt_train, jet_pt_test,\ ip3d_vars_train, ip3d_vars_test,\ ipmp_vars_train, ipmp_vars_test = train_test_split( X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars, train_size=0.6 ) # -- scale inputs to 0 mean, 1 std scaler = StandardScaler() X_train = scaler.fit_transform(X_train) X_test = scaler.transform(X_test) ip3d_vars_train = scaler.fit_transform(ip3d_vars_train) ip3d_vars_test = scaler.transform(ip3d_vars_test) ipmp_vars_train = scaler.fit_transform(ipmp_vars_train) ipmp_vars_test = scaler.transform(ipmp_vars_test) # -- split the previously selected training data into train and validate X_train, X_validate,\ y_train, y_validate,\ w_train, w_validate,\ ix_train, ix_validate,\ mv2c10_train, mv2c10_validate,\ jet_pt_train, jet_pt_validate,\ ip3d_vars_train, ip3d_vars_validate,\ ipmp_vars_train, ipmp_vars_validate = train_test_split( X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train, ip3d_vars_train, ipmp_vars_train, train_size=0.7 ) # -- assign train, test, validate data to dictionaries train = { 'X' : X_train, 'ip3d_vars': ip3d_vars_train, 'ipmp_vars': ipmp_vars_train, 'y' : y_train, 'w' : w_train, 'ix': ix_train, 'mv2c10': mv2c10_train, 'pt': jet_pt_train } test = { 'X' : X_test, 'ip3d_vars': ip3d_vars_test, 'ipmp_vars': ipmp_vars_test, 'y' : y_test, 'w' : w_test, 'ix': ix_test, 'mv2c10': mv2c10_test, 'pt': jet_pt_test } validate = { 'X' : X_validate, 'ip3d_vars': ip3d_vars_validate, 'ipmp_vars': ipmp_vars_validate, 'y' : y_validate, 'w' : w_validate, 'ix': ix_validate, 'mv2c10': mv2c10_validate, 'pt': jet_pt_validate } # -- save dictionaries to hdf5 logger.info('Saving dictionaries to hdf5...') hdf5_train_path = os.path.join('..', 'data', 'DL1-' + model_id + str(i) +'-train-db.h5') hdf5_test_path = os.path.join('..', 'data', 'DL1-' + model_id + str(i) +'-test-db.h5') hdf5_validate_path = os.path.join('..', 'data', 'DL1-' + model_id + str(i) +'-validate-db.h5') 
io.save(hdf5_train_path, train) io.save(hdf5_test_path, test) io.save(hdf5_validate_path, validate) logger.debug('Saved hdf5 archives: {}, {}, {}'. format(hdf5_train_path, hdf5_test_path, hdf5_validate_path)) return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
def main(inputfiles, treename='bTag_AntiKt2PV0TrackJets'): configure_logging() logger = logging.getLogger('ProcessTrackJetData') # -- import root files into df logger.info('Importing ROOT files into pandas dataframes') df = pup.root2panda( inputfiles, treename, branches=[ 'jet_pt', 'jet_eta', 'jet_phi', 'jet_m', 'jet_ip2d_pu', 'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc', 'jet_ip3d_pb', 'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z', 'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t', 'jet_sv1_sig3d', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx', 'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_sig3d', 'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz', 'jet_aliveAfterOR', 'jet_aliveAfterORmu', 'jet_nConst', 'jet_LabDr_HadF' ]) # -- Insert default values, calculate MV2 variables from the branches in df logger.info('Creating MV2 variables') df = transformVars(df) # -- Flatten from event-flat to jet-flat # -- Before doing so, remove event-level variables such as PVx,y,z logger.info('Flattening dataframe') df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True) df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iterkv()}) # -- apply eta, pt, OR cuts from b-tagging recommendations logger.info('Applying cuts') df_flat = applycuts(df_flat) # -- build X, y, w # -- target values y = df_flat['jet_LabDr_HadF'].values # -- slice df by only keeping the 24 variables for MV2 training training_vars = [ 'jet_pt', 'abs(jet_eta)', 'jet_ip2', 'jet_ip2_c', 'jet_ip2_cu', 'jet_ip3', 'jet_ip3_c', 'jet_ip3_cu', 'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t', 'jet_sv1_Lxy', 'jet_sv1_L3d', 'jet_sv1_sig3d', 'jet_sv1_dR', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx', 'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_dR', 'jet_jf_sig3d' ] X = df_flat[training_vars].as_matrix() logger.info( '2D pT and eta reweighting of charm and light to bottom distribution') w = reweight_to_b(X, y) X, y, w = remove_tau(X, y, w) # -- turn classes 0, 4, 5, 15 to 0, 1, 2, 3 # le = LabelEncoder() # y = le.fit_transform(y) # -- randomly shuffle and split into train and test set logger.info('Shuffling and splitting') X_train, X_test, y_train, y_test, w_train, w_test = train_test_split( X, y, w, train_size=0.6) # -- save out to hdf5 logger.info('Saving data to hdf5') io.save(open('train_data.h5', 'wb'), { 'X': X_train, 'y': y_train, 'w': w_train }) io.save(open('test_data.h5', 'wb'), { 'X': X_test, 'y': y_test, 'w': w_test })
#use : $python n2a.py /path/[ntupleName].root
import numpy as np
from numpy.lib.recfunctions import stack_arrays
from ROOT import *
from root_numpy import tree2array
import glob
import pandas as pd
import deepdish.io as io
import sys

arg = sys.argv[1]  #arg = ntuple.root

input_ntuple = TFile.Open(arg)
input_ntuple_tree = input_ntuple.Get('dnn_input')
input_ntuple_array = tree2array(input_ntuple_tree)
input_ntuple_df = pd.DataFrame(input_ntuple_array)

#a = [ntuple].root (delete path)
for a in arg.split('/'):
    continue
a = a.replace("ana", "")       #a = [ntupleName].root
a = a.replace(".root", ".h5")  #a = [ntupleName].h5

#saved as [ntupleName].h5
io.save(a, input_ntuple_df)
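# Typical usage of the conversion script above, with the output read straight back
# into pandas (the file name is illustrative and depends on the input ntuple):
#   $ python n2a.py /path/anaMyNtuple.root     # writes MyNtuple.h5
import os
import deepdish.io as io

if os.path.isfile('MyNtuple.h5'):
    df = io.load('MyNtuple.h5')
    print(df.shape)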
import numpy as np
from root_numpy import root2array
import matplotlib
import matplotlib.pyplot as plt
import cPickle as pickle
import deepdish.io as io

ttbar = root2array('ttbar.root')
qcd = root2array('qcd.root')
wjets = root2array('wjets.root')

matplotlib.rcParams.update({'font.size': 16})
fig = plt.figure(figsize=(11.69, 8.27), dpi=100)

bins = np.linspace(
    min(min(ttbar['NMuon']), min(qcd['NMuon']), min(wjets['NMuon'])),
    max(max(ttbar['NMuon']), max(qcd['NMuon']), max(wjets['NMuon'])),
    10)

_ = plt.hist(
    [ttbar['NMuon'], qcd['NMuon'], wjets['NMuon']],
    stacked=True,
    label=[r'$t\overline{t}$', 'QCD', 'wjets'],
    alpha=0.5,
    histtype='stepfilled',
    normed=False,
    bins=bins,
    weights=[ttbar['EventWeight'], qcd['EventWeight'], wjets['EventWeight']])

plt.xlabel('NMuon')
plt.ylabel('Number of Events')
plt.yscale('log')
plt.legend()
plt.plot()
plt.savefig('task3.pdf')

io.save('wj_nmuons.h5', wjets['NMuon'])
#new_df = io.load('wj_nmuons.h5')
#print new_df

pickle.dump(wjets['NMuon'], open('wj_nmuons.pkl', 'wb'))
#test = pickle.load(open('wj_nmuons.pkl', 'rb'))
#print test