Example #1
def saveClassAttributes(clss, form, save_addr):
    """ Save class attributes.

    **Parameters**\n
    clss: instance
        Handle of the instance to be saved.
    form: str
        Format to save in ('h5'/'hdf5', 'mat', or 'dmp'/'dump').
    save_addr: str
        The address to save the attributes in.
    """

    save_addr = u.appendformat(save_addr, form)

    if form == 'mat':
        sio.savemat(save_addr, clss.__dict__)

    elif form in ('h5', 'hdf5'):
        try:
            dictdump.dicttoh5(clss.__dict__, save_addr)
        except:
            dio.save(save_addr, clss.__dict__, compression=None)

    elif form in ('dmp', 'dump'):
        fh = open(save_addr, 'wb')
        pickle.dump(clss, fh)
        fh.close()

    else:
        raise NotImplementedError
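
A minimal round-trip sketch of the 'h5'/'hdf5' branch above, using deepdish directly; it assumes only that deepdish is installed (the helper modules u, sio and dictdump from the original are not needed, and the class below is a made-up stand-in):

import deepdish.io as dio

class _Dummy(object):
    """Hypothetical stand-in for the instance passed to saveClassAttributes."""
    def __init__(self):
        self.a = 1
        self.b = [1.0, 2.0, 3.0]

inst = _Dummy()
dio.save('attrs.h5', inst.__dict__)   # write the attribute dict to HDF5
restored = dio.load('attrs.h5')       # read it back as a plain dict
assert restored['a'] == 1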
Example #2
def saveClassAttributes(clss, form, save_addr):
    """ Save class attributes.

    :Parameters:
        clss : instance
            Handle of the instance to be saved.
        form : str
            Format to save in ('h5' or 'mat').
        save_addr : str
            The address to save the attributes in.
    """

    save_addr = u.appendformat(save_addr, form)

    if form == 'mat':
        sio.savemat(save_addr, clss.__dict__)

    elif form in ('h5', 'hdf5'):
        try:
            dictdump.dicttoh5(clss.__dict__, save_addr)
        except:
            dio.save(save_addr, clss.__dict__, compression=None)

    else:
        raise NotImplementedError
Example #3
def test(net, data):
	'''
	Args:
	-----
		net: a trained keras Model instance
		data: an OrderedDict containing all X, y, w ndarrays for all particles (both train and test), e.g.:
              data = {
                "X_jet_train" : X_jet_train,
                "X_jet_test" : X_jet_test,
                "X_photon_train" : X_photon_train,
                "X_photon_test" : X_photon_test,
                "y_train" : y_train,
                "y_test" : y_test,
                "w_train" : w_train,
                "w_test" : w_test
              }
    Returns:
    --------
    	yhat: numpy array of dim [n_ev, n_classes] with the net predictions on the test data 
	'''
	yhat = net.predict([data['X_jet_test'], data['X_photon_test'], data['X_event_test']], 
		verbose = True, batch_size = 512)
	io.save(open(os.path.join('output','yhat.h5'), 'wb'), yhat)
	# -- example of other quantities that can be evaluated from yhat
	#class_predictions = [np.argmax(ev) for ev in yhat]
	return yhat
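
A hedged follow-up to the commented-out line above: deriving per-event class labels from the saved predictions (the path mirrors the io.save call in test(), but treat it as an assumption):

import numpy as np
import deepdish.io as io

yhat = io.load('output/yhat.h5')             # predictions written by test() above
class_predictions = np.argmax(yhat, axis=1)  # index of the most probable class per event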
Example #4
 def save_pde_sol(self, fname, sol, t=None):
     b, w = self.split_sol(sol)
     data = {}
     data['b'] = b
     data['w'] = w
     data['Ps'] = self.p
     data['Es'] = self.setup
     if t is not None:
         data['t'] = t
     dd.save(fname + '.hdf5', data, compression='blosc')
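
A hedged read-back counterpart (the file name is hypothetical; dd is assumed to be deepdish.io, as in save_pde_sol above):

import deepdish.io as dd

data = dd.load('run001.hdf5')       # hypothetical file written by save_pde_sol
b, w = data['b'], data['w']         # the two split solution fields
Ps, Es = data['Ps'], data['Es']     # parameter and setup records as saved above
t = data.get('t')                   # present only if a time axis was saved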
Example #5
    def save(self, path):
        """
        Saves an instance of the class using :func:`deepdish.io.save`.

        Parameters
        ----------
        path : str
            Output path to HDF5 file.
        """
        io.save(path, self.save_to_dict())
Example #6
    def save(self, path):
        """
        Saves an instance of the class using `deepdish.io.save`.

        Parameters
        ----------
        path : str
            Output path to HDF5 file.
        """
        io.save(path, self.save_to_dict())
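
A sketch of the matching loader, assuming the class also defines a load_from_dict constructor (not shown in the two examples above):

    @classmethod
    def load(cls, path):
        """
        Loads an instance previously written by :meth:`save`, using
        :func:`deepdish.io.load`.
        """
        return cls.load_from_dict(io.load(path))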
Example #7
def saveParmSet(fname, par, text=None, saveroot=False):
    from deepdish.io import save
    if saveroot:
        from tlmModel import tlmModel, Es_normal
        m = tlmModel(Es=Es_normal, Ps=par, Vs=None)
        t, result = m.ode_integrate([0.9, 0.2, 0.2])
        b, w, h = result.T[-1]
        par['b'] = b
        par['w'] = w
        par['h'] = h
    if text is not None:
        par['text'] = text
    save("./auto/" + fname + ".hdf5", par)
Example #8
def test():
    '''
    '''
    data = io.load(open('test_data.h5', 'rb'))
    #data = remove_tau(data)

    # -- Load scikit classifier
    classifier = joblib.load('sklBDT_trk2.pkl')
    
    # -- Get classifier predictions
    yhat = classifier.predict_proba(data['X'])[:, 2]

    io.save(open('yhat_test.h5', 'wb'), yhat)
Example #9
def saveParmSet(fname, par, text=None, saveroot=False):
    from deepdish.io import save
    if saveroot:
        from tlmModel import tlmModel, Es_normal
        Es_normal['verbose'] = False
        m = tlmModel(Es=Es_normal, Ps=par, Vs=None)
        t, result = m.ode_integrate([0.9, 0.3, 0.3])
        b, s1, s2 = result[-1]
        par['b'] = b
        par['s1'] = s1
        par['s2'] = s2
        print b, s1, s2
    if text is not None:
        par['text'] = text
    save("./auto/" + fname + ".hdf5", par)
Example #10
def pinv(x):
    # This could be a numpy issue, since the same matrix works fine in Matlab.
    # https://github.com/numpy/numpy/issues/1588
    try:
        res = np.linalg.pinv(x)
    except np.linalg.LinAlgError as err:
        n_retry = 0
        while True:
            try:
                res = np.linalg.pinv(x + np.random.randn(*x.shape) * 1e-15)
                break
            except np.linalg.LinAlgError:
                n_retry += 1
                if n_retry == 3:
                    from deepdish import io as dio
                    dio.save('debug.hdf', dict(x=x))
                    raise err
    return res
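
A short usage sketch for pinv (assuming numpy is imported as np, as the function above already requires):

x = np.random.randn(5, 3)
x_pinv = pinv(x)                              # same as np.linalg.pinv for well-behaved input
assert np.allclose(x.dot(x_pinv).dot(x), x)   # Moore-Penrose property: A A+ A = A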
Example #11
def main():
    opts = parse_options()

    inFile = opts.inputFile
    tree = opts.treeName

    df = root2pandas(inFile, tree)
    # -- save a pandas df to hdf5 (better to first convert it back to ndarray, to be fair)

    import deepdish.io as io
    outFile = inFile.replace(".root", ".h5")
    io.save(outFile, df)

    # -- let's load it back in to make sure it actually worked!
    new_df = io.load(outFile)

    # -- check the shape again -- nice check to run every time you create a df
    print "File check!"
    print "(Number of events, Number of branches): ", new_df.shape
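
A hedged sketch of the ndarray route mentioned in the comment above, reusing df from main(); the output path and the column bookkeeping are illustrations, not part of the original script:

import deepdish.io as io

arr = df.values                                  # plain ndarray, no pandas index
io.save('df_as_array.h5',                        # hypothetical output path
        {'data': arr, 'columns': list(df.columns)})
back = io.load('df_as_array.h5')
assert back['data'].shape == arr.shape           # same (events, branches) layout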
Example #12
def main(MODEL_FILE):

    print "Loading hdf5's..."
    test_dict = io.load('./data/test_dict_IPConv_ntuple_'+ RUN_NAME +'.h5')
    train_dict = io.load('./data/train_dict_IPConv_ntuple_'+ RUN_NAME +'.h5')
    
    X_train = train_dict['X']
    y_train = train_dict['y']    

    X_test = test_dict['X']
    y_test = test_dict['y']
    n_features = X_test.shape[2]

    # this is a df
    ip3d = test_dict['ip3d'] 

    print 'Building model...'
    
    if (MODEL_FILE == 'CRNN'):
        graph = build_graph(n_features)

        model = Sequential()

        model.add(graph)
        # remove Maxout for tensorflow
        model.add(MaxoutDense(64, 5, input_shape=graph.nodes['dropout'].output_shape[1:]))
        model.add(Dense(64))

    elif (MODEL_FILE == 'RNN'):

        model = Sequential()
        model.add(Masking(mask_value=-999, input_shape=(N_TRACKS, n_features)))
        model.add(GRU(25))#, input_shape=(N_TRACKS, n_features))) #GRU
        model.add(Dropout(0.2)) #0.2
    
        # remove Maxout for tensorflow
        model.add(MaxoutDense(64, 5))  #, input_shape=graph.nodes['dropout'].output_shape[1:]))
        model.add(Dense(64))

  
    model.add(Dropout(0.4))

    model.add(Highway(activation = 'relu'))

    model.add(Dropout(0.3))
    model.add(Dense(4))

    model.add(Activation('softmax'))

    print 'Compiling model...'
    model.compile('adam', 'categorical_crossentropy')
    model.summary()

    print 'Training:'
    try:
        model.fit(X_train, y_train, batch_size=512,
            callbacks = [
                EarlyStopping(verbose=True, patience=20, monitor='val_loss'),
                ModelCheckpoint(MODEL_FILE + RUN_NAME +'-progress', monitor='val_loss', verbose=True, save_best_only=True)
            ],
        nb_epoch=100, 
        validation_split = 0.2, 
        show_accuracy=True) 
        
    except KeyboardInterrupt:
        print 'Training ended early.'

    # -- load in best network
    model.load_weights(MODEL_FILE + RUN_NAME +'-progress')
    
    if (SAVE_PROTOBUF):
        print 'Saving protobuf'
        # write out to a new directory called models
        # the actual graph file is graph.pb
        # the graph def is in the global session
        import tensorflow as tf
        import keras.backend.tensorflow_backend as tfbe

        sess = tfbe._SESSION

        saver = tf.train.Saver()
        tf.train.write_graph(sess.graph_def, 'models/', 'graph.pb', as_text=False)    

        save_path = saver.save(sess, "./model-weights.ckpt")
        print "Model saved in file: %s" % save_path
        
        print saver.as_saver_def().filename_tensor_name
        print saver.as_saver_def().restore_op_name

        print model.get_output()

    print 'Saving weights...'
    model.save_weights('./weights/ip3d-replacement_' + MODEL_FILE + RUN_NAME +'.h5', overwrite=True)

    json_string = model.to_json()
    open(MODEL_FILE + RUN_NAME +'.json', 'w').write(json_string)

    print 'Testing...'
    yhat = model.predict(X_test, verbose = True, batch_size = 512) 
    io.save('yhat'+ RUN_NAME +'.h5', yhat) 
     
    print 'Plotting ROC...'
    fg = plot_ROC(y_test, yhat, ip3d, MODEL_FILE)
    #plt.show()
    fg.savefig('./plots/roc' + MODEL_FILE + RUN_NAME +'.pdf')
Example #13
def save_hdf_from_nc(fname):
    from deepdish.io import save
    data=conv_nc_to_dict(fname)
    if fname.endswith('.nc'):
        fname=fname[:-3]
    save(fname+'.hdf5',data)
Example #14
def makeCombi(inputDir, inputFile, outputDir, makeTrainingInput=False, sys=''):
    print(str(inputDir+"/"+inputFile)+" start")
    chain = TChain("ttbbLepJets/tree")
    chain.Add(inputDir+"/"+inputFile)

    data = False
    if 'Data' in inputDir: data = True
    ttbb = False
    if 'ttbb' in inputDir: ttbb = True
    if makeTrainingInput:  ttbb = True

    muon_ch = 0
    muon_pt = 30.0
    muon_eta = 2.1
    electron_ch = 1
    electron_pt = 35.0
    electron_eta = 2.1
    jet_pt = 30.0
    jet_eta = 2.4
    jet_CSV_tight = 0.9535

    jetCombi = []
    for i in xrange(chain.GetEntries()) :
        chain.GetEntry(i)

        lepton_SF = 1.0
        jet_SF_CSV = 1.0
        pdfweight = []
        scaleweight = []
        PUWeight = []
        lepton_SF = []
        jet_SF_CSV_30 = []
        if not data:
            for j in xrange((chain.lepton_SF).size()):
                lepton_SF.append(float(chain.lepton_SF[j]))
            for j in xrange((chain.jet_SF_CSV_30).size()):
                jet_SF_CSV_30.append(float(chain.jet_SF_CSV_30[j]))
            for j in xrange((chain.PUWeight).size()):
                PUWeight.append(float(chain.PUWeight[j]))
        if 'TT' in inputDir or 'tt' in inputDir:
            for j in xrange((chain.scaleweight).size()):
                scaleweight.append(float(chain.scaleweight[j]))
            for j in xrange((chain.pdfweight).size()):
                pdfweight.append(float(chain.pdfweight[j]))

        MET_px = chain.MET*math.cos(chain.MET_phi)
        MET_py = chain.MET*math.sin(chain.MET_phi)
        nu = TLorentzVector(MET_px, MET_py, 0, chain.MET)

        lep = TLorentzVector()
        lep.SetPtEtaPhiE(chain.lepton_pT, chain.lepton_eta, chain.lepton_phi, chain.lepton_E)
        passmuon = False
        passelectron = False
        passmuon = chain.channel == muon_ch and lep.Pt() > muon_pt and abs(lep.Eta()) < muon_eta
        passelectron = chain.channel == electron_ch and lep.Pt() > electron_pt and abs(lep.Eta()) < electron_eta
        if passmuon == False and passelectron == False:
            continue

        addbjet1 = TLorentzVector(0,0,0,0)
        addbjet2 = TLorentzVector(0,0,0,0)
        if ttbb:
            addbjet1.SetPtEtaPhiE(chain.addbjet1_pt,chain.addbjet1_eta,chain.addbjet1_phi,chain.addbjet1_e)
            addbjet2.SetPtEtaPhiE(chain.addbjet2_pt,chain.addbjet2_eta,chain.addbjet2_phi,chain.addbjet2_e)
        njets = 0
        nbjets = 0
        addbjet1_matched = TLorentzVector(0,0,0,0)
        addbjet2_matched = TLorentzVector(0,0,0,0)

        for iJet in range(len(chain.jet_pT)):
            jet = TLorentzVector()
            jet.SetPtEtaPhiE(chain.jet_pT[iJet],chain.jet_eta[iJet],chain.jet_phi[iJet],chain.jet_E[iJet])

            if not data :
                if   'jecup'   in sys:
                    jet *= chain.jet_JER_Nom[iJet] * chain.jet_JES_Up[iJet]
                elif 'jecdown' in sys:
                    jet *= chain.jet_JER_Nom[iJet] * chain.jet_JES_Down[iJet]
                elif 'jerup'   in sys:
                    jet *= chain.jet_JER_Up[iJet]
                elif 'jerdown' in sys:
                    jet *= chain.jet_JER_Down[iJet]
                else:
                    jet *= chain.jet_JER_Nom[iJet]

            if jet.Pt() < jet_pt or abs(jet.Eta()) > jet_eta: continue
            njets += 1
            if chain.jet_CSV[iJet] > jet_CSV_tight: nbjets += 1

            if addbjet1.DeltaR(jet) < 0.4: addbjet1_matched = jet;
            if addbjet2.DeltaR(jet) < 0.4: addbjet2_matched = jet;

        if njets < 6 or nbjets < 3: continue
        print("addbjet1: "+str(addbjet1.Pt())+" matched: "+str(addbjet1_matched.Pt()))
        print("addbjet2: "+str(addbjet2.Pt())+" matched: "+str(addbjet2_matched.Pt()))

        for j in range(len(chain.jet_pT)-1):
            for k in range(j+1, len(chain.jet_pT)):
                if chain.jet_CSV[j] > jet_CSV_tight and chain.jet_CSV[k] > jet_CSV_tight:
                    b1 = TLorentzVector()
                    b2 = TLorentzVector()
                    b1.SetPtEtaPhiE(chain.jet_pT[j], chain.jet_eta[j], chain.jet_phi[j], chain.jet_E[j])
                    b2.SetPtEtaPhiE(chain.jet_pT[k], chain.jet_eta[k], chain.jet_phi[k], chain.jet_E[k])
                    if not data :
                        if   'jecup'   in sys:
                            b1 *= chain.jet_JER_Nom[j] * chain.jet_JES_Up[j]
                            b2 *= chain.jet_JER_Nom[k] * chain.jet_JES_Up[k]
                        elif 'jecdown' in sys:
                            b1 *= chain.jet_JER_Nom[j] * chain.jet_JES_Down[j]
                            b2 *= chain.jet_JER_Nom[k] * chain.jet_JES_Down[k]
                        elif 'jerup'   in sys:
                            b1 *= chain.jet_JER_Up[j]
                            b2 *= chain.jet_JER_Up[k]
                        elif 'jerdown' in sys:
                            b1 *= chain.jet_JER_Down[j]
                            b2 *= chain.jet_JER_Down[k]
                        else                 :
                            b1 *= chain.jet_JER_Nom[j]
                            b2 *= chain.jet_JER_Nom[k]

                    if makeTrainingInput:
                        if (addbjet1_matched.DeltaR(b1) == 0 and addbjet2_matched.DeltaR(b2) == 0) or (addbjet2_matched.DeltaR(b1) == 0  and addbjet1_matched.DeltaR(b2) == 0):
                            signal = 1
                        else:
                            signal = 0

                        jetCombi.append([
                            signal,i,b1.DeltaR(b2),abs(b1.Eta()-b2.Eta()),b1.DeltaPhi(b2),
                            (b1+b2+nu).Pt(),(b1+b2+nu).Eta(),(b1+b2+nu).Phi(),(b1+b2+nu).M(),
                            (b1+b2+lep).Pt(),(b1+b2+lep).Eta(),(b1+b2+lep).Phi(),(b1+b2+lep).M(),
                            (b1+lep).Pt(),(b1+lep).Eta(),(b1+lep).Phi(),(b1+lep).M(),
                            (b2+lep).Pt(),(b2+lep).Eta(),(b2+lep).Phi(),(b2+lep).M(),
                            (b1+b2).Pt(),(b1+b2).Eta(),(b1+b2).Phi(),(b1+b2).M(),
                            chain.jet_CSV[j],chain.jet_CSV[k],
                            b1.Pt(),b2.Pt(),b1.Eta(),b2.Eta(),b1.Phi(),b2.Phi(),b1.E(),b2.E()
                        ])
                    else:
                        jetCombi.append([
                            #Tree info
                            i, chain.channel, njets, nbjets,
                            chain.genweight, PUWeight,
                            lepton_SF, jet_SF_CSV_30, scaleweight, pdfweight,
                            lep.Pt(), lep.Eta(), lep.Phi(), lep.E(),
                            addbjet1.Pt(), addbjet1.Eta(), addbjet1.Phi(), addbjet1.E(),
                            addbjet2.Pt(), addbjet2.Eta(), addbjet2.Phi(), addbjet2.E(),
                            j,k,
                            #Deep learning variables
                            b1.DeltaR(b2),abs(b1.Eta()-b2.Eta()),b1.DeltaPhi(b2),
                            (b1+b2+nu).Pt(),(b1+b2+nu).Eta(),(b1+b2+nu).Phi(),(b1+b2+nu).M(),
                            (b1+b2+lep).Pt(),(b1+b2+lep).Eta(),(b1+b2+lep).Phi(),(b1+b2+lep).M(),
                            (b1+lep).Pt(),(b1+lep).Eta(),(b1+lep).Phi(),(b1+lep).M(),
                            (b2+lep).Pt(),(b2+lep).Eta(),(b2+lep).Phi(),(b2+lep).M(),
                            (b1+b2).Pt(),(b1+b2).Eta(),(b1+b2).Phi(),(b1+b2).M(),
                            chain.jet_CSV[j],chain.jet_CSV[k],
                            b1.Pt(),b2.Pt(),b1.Eta(),b2.Eta(),b1.Phi(),b2.Phi(),b1.E(),b2.E()
                            ])

    if makeTrainingInput:
        combi = pd.DataFrame(jetCombi, columns=['signal', 'event']+ut.getVarlist())
    else:
        combi = pd.DataFrame(jetCombi, columns=
            ['event','channel','njets','nbjets',
            'genWeight','PUWeight',
            'lepton_SF','jet_SF_CSV_30', 'scaleweight', 'pdfweight',
            'leptonPt','leptonEta','leptonPhi','leptonE',
            'addbjet1_pt','addbjet1_eta','addbjet1_phi','addbjet1_e',
            'addbjet2_pt','addbjet2_eta','addbjet2_phi','addbjet2_e',
            'b1','b2',
            ] + ut.getVarlist())

    tmp = inputFile[:-5]
    if makeTrainingInput:
        io.save(outputDir+"/array_train_ttbb.h5",combi)
    else:
        io.save(outputDir+"/array_"+tmp+".h5",combi)
    print(str(inputDir+"/"+inputFile)+" end")
Example #15
def _fit_and_score_ckpt(workdir=None,
                        checkpoint=True,
                        force_refresh=False,
                        **fit_and_score_kwargs):
    """Fit estimator and compute scores for a given dataset split.

    This function wraps
    :func:`sklearn:sklearn.model_selection._validation._fit_and_score`,
    while also saving checkpoint files containing the estimator, parameters,
    and scores. This is useful if fitting and scoring is costly or if it is being
    performed within a large cross-validation experiment.

    To avoid collisions with scores computed for other CV splits, this
    function computes a hash from a nested dictionary containing all keyword
    arguments as well as estimator parameters. It then saves the scores and
    parameters in <hash>_params.h5 and the estimator itself in
    <hash>_estimator.pkl

    Parameters
    ----------
    workdir : path-like object, default=None
        A string or :term:`python:path-like-object` indicating the directory
        in which to store checkpoint files

    checkpoint : bool, default=True
        If True, checkpoint the parameters, estimators, and scores.

    force_refresh : bool, default=False
        If True, recompute scores even if the checkpoint file already exists.
        Otherwise, load scores from checkpoint files and return.

    **fit_and_score_kwargs : kwargs
        Key-word arguments passed to
        :func:`sklearn:sklearn.model_selection._validation._fit_and_score`

    Returns
    -------
    train_scores : dict of scorer name -> float
        Score on training set (for all the scorers),
        returned only if `return_train_score` is `True`.

    test_scores : dict of scorer name -> float
        Score on testing set (for all the scorers).

    n_test_samples : int
        Number of test samples.

    fit_time : float
        Time spent for fitting in seconds.

    score_time : float
        Time spent for scoring in seconds.

    parameters : dict or None
        The parameters that have been evaluated.

    estimator : estimator object
        The fitted estimator
    """
    if not checkpoint:
        return _fit_and_score(**fit_and_score_kwargs)

    if workdir is None:
        raise ValueError(
            "If checkpoint is True, you must supply a working directory "
            "through the ``workdir`` argument.")

    estimator = fit_and_score_kwargs.pop("estimator", None)
    estimator_params = _serialize_estimator_params(estimator.get_params())
    all_params = {
        "estimator_params": estimator_params,
        "fit_and_score_kwargs": fit_and_score_kwargs,
    }

    cv_hash = hashlib.md5(
        json.dumps(all_params, sort_keys=True, ensure_ascii=True,
                   default=str).encode()).hexdigest()

    h5_file = os.path.join(workdir, cv_hash + "_params.h5")
    pkl_file = os.path.join(workdir, cv_hash + "_estimator.pkl")

    if not force_refresh and os.path.exists(h5_file):
        ckpt_dict = ddio.load(h5_file)

        scores = ckpt_dict["scores"]

        if fit_and_score_kwargs.get("return_estimator", False):
            with open(pkl_file, "rb") as fp:
                estimator = pickle.load(fp)

            scores.append(estimator)

        return scores
    else:
        scores = _fit_and_score(estimator, **fit_and_score_kwargs)
        os.makedirs(workdir, exist_ok=True)
        if fit_and_score_kwargs.get("return_estimator", False):
            estimator = scores[-1]
            with open(pkl_file, "wb") as fp:
                pickle.dump(estimator, fp)

            ckpt_scores = scores[:-1]
            if isinstance(estimator, Pipeline):
                model = estimator.steps[-1]
            else:
                model = estimator

            estimator_params = _serialize_estimator_params(
                estimator.get_params())
            fitted_params = {
                "alpha_": getattr(model, "alpha_", None),
                "alphas_": getattr(model, "alphas_", None),
                "l1_ratio_": getattr(model, "l1_ratio_", None),
                "mse_path_": getattr(model, "mse_path_", None),
                "scoring_path_": getattr(model, "scoring_path_", None),
                "intercept_": getattr(model, "intercept_", None),
                "coef_": getattr(model, "coef_", None),
            }
        else:
            estimator_params = None
            fitted_params = None
            ckpt_scores = scores

        fit_and_score_kwargs.pop("X")
        fit_and_score_kwargs.pop("y")

        if "scorer" in fit_and_score_kwargs:
            fit_and_score_kwargs["scorer"] = list(
                fit_and_score_kwargs["scorer"].keys())

        ckpt_dict = {
            "scores": ckpt_scores,
            "fit_and_score_kwargs": fit_and_score_kwargs,
            "estimator_params": estimator_params,
            "fitted_params": fitted_params,
        }

        ddio.save(h5_file, ckpt_dict)
        return scores
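
A minimal, self-contained illustration of the checkpoint-key scheme described in the docstring above (the parameter values are made up; only the standard library is used):

import hashlib
import json
import os

all_params = {
    "estimator_params": {"alpha": 0.5, "fit_intercept": True},
    "fit_and_score_kwargs": {"train": [0, 1, 2], "test": [3, 4]},
}
cv_hash = hashlib.md5(
    json.dumps(all_params, sort_keys=True, ensure_ascii=True,
               default=str).encode()).hexdigest()

# identical inputs always map to the same pair of checkpoint files
print(os.path.join("workdir", cv_hash + "_params.h5"))
print(os.path.join("workdir", cv_hash + "_estimator.pkl"))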
Example #16
def main(embed_size, normed, input_id, run_name):

    configure_logging()
    logger = logging.getLogger("RNNIP Training")

    logger.info("Loading hdf5's")
    test_dict = io.load(os.path.join('data', 'test_dict_' + input_id + '.h5'))
    train_dict = io.load(os.path.join('data',
                                      'train_dict_' + input_id + '.h5'))

    X_train_stream0 = train_dict['grade']
    X_train_stream1 = train_dict['X']
    y_train = train_dict['y']

    X_test_stream0 = test_dict['grade']
    X_test_stream1 = test_dict['X']
    y_test = test_dict['y']

    ip3d = test_dict['ip3d']

    logger.info('Building model')
    model = build_model(X_train_stream0, X_train_stream1, embed_size, normed)
    model.summary()

    logger.info('Compiling model')
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

    #-- if the pre-trained model exists, load it in, otherwise start from scratch
    safe_mkdir('weights')
    weights_file = os.path.join('weights', 'rnnip_' + run_name + '.h5')
    try:
        model.load_weights(weights_file)
        logger.info('Loaded pre-trained model from ' + weights_file)
    except IOError:
        logger.info('No pre-trained model found in ' + weights_file)

    logger.info('Training:')
    try:
        model.fit([X_train_stream0, X_train_stream1],
                  y_train,
                  batch_size=512,
                  callbacks=[
                      EarlyStopping(verbose=True,
                                    patience=20,
                                    monitor='val_loss'),
                      ModelCheckpoint(weights_file,
                                      monitor='val_loss',
                                      verbose=True,
                                      save_best_only=True)
                  ],
                  epochs=300,
                  validation_split=0.2)

    except KeyboardInterrupt:
        logger.info('Training ended early.')

    # -- load in best network
    logger.info('Loading best epoch')
    model.load_weights(weights_file)

    json_string = model.to_json()
    safe_mkdir('json_models')
    open(os.path.join('json_models', run_name + '.json'),
         'w').write(json_string)

    logger.info('Testing')
    safe_mkdir('predictions')
    yhat = model.predict([X_test_stream0, X_test_stream1],
                         verbose=True,
                         batch_size=10000)
    io.save(os.path.join('predictions', 'yhat' + run_name + '.h5'), yhat)

    logger.info('Plotting ROC')
    plot_ROC(y_test, yhat, ip3d, run_name)
Example #17
                    #'jet_trk_dPhi'] # more to be added in `process_data`
                    'jet_trk_phi'] # more to be added in `process_data`

    cut_vars = ['jet_eta', 'jet_pt', 'jet_JVT', 'jet_aliveAfterOR'] # only necessary to remove bad jets

     # -- load and process training set
    print 'Loading training dataframe...'
    trk_train = pup.root2panda(
        './data/Dan/NOtrkSel/train_NOtrkSel.root', 
        'bTag_AntiKt4EMTopoJets', 
        branches = track_inputs + cut_vars + ['jet_LabDr_HadF' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_theta']
    )
    print 'Processing training sample ...'
    train_dict = process_data(trk_train, cut_vars, savevars=True)
    del trk_train
    io.save('./data/train_dict_IPConv_ntuple_MyTrkSel.h5', train_dict)

    # -- load and process test set
    print 'Loading test dataframe...'
    trk_test  = pup.root2panda(
        './data/Dan/NOtrkSel/test/user.dguest.8493098.Akt4EMTo._000013_NOtrkSel.root', 
        'bTag_AntiKt4EMTopoJets', 
        branches = track_inputs + cut_vars + ['jet_LabDr_HadF' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_theta']
    )
    print 'Processing test sample...'
    test_dict = process_data(trk_test, cut_vars, savevars=False)
    del trk_test
    io.save('./data/test_dict_IPConv_ntuple_MyTrkSel.h5', test_dict)


    
Example #18
                        help="Maximum number of tracks per event. \
        If the event has fewer tracks, use padding; if it has more, only consider the first ntrk"
                        )
    args = parser.parse_args()

    print 'Loading dataframes...'
    # -- currently only training and testing on one file each!
    trk_train = pup.root2panda(os.path.join('data', 'train', args.train_files),
                               'bTag_AntiKt4EMTopoJets',
                               branches=track_inputs + jet_inputs)
    trk_test = pup.root2panda(os.path.join('data', 'test', args.test_files),
                              'bTag_AntiKt4EMTopoJets',
                              branches=track_inputs + jet_inputs)
    print 'Processing training sample ...'
    train_dict = process_data(trk_train,
                              jet_inputs,
                              args.ntrk,
                              args.sort_by,
                              args.output,
                              savevars=True)
    del trk_train
    io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'),
            train_dict)

    print 'Processing test sample...'
    test_dict = process_data(trk_test, jet_inputs, args.ntrk, args.sort_by,
                             args.output)
    del trk_test
    io.save(os.path.join('data', 'test_dict_' + args.output + '.h5'),
            test_dict)
Example #19
def process(i, filepath, yaml_file):
    '''
    '''
    import pandautils as pup

    branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features(
        yaml_file)
    logger = logging.getLogger("ETL Service")
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    logger.info('Transforming variables...')
    df = transformVars(df)

    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values
    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- Find weights by reweighting to the light distribution
    # -- TO DO: pass the pt and eta columns directly, instead of passing their indices
    pt_col = np.argwhere(np.array(training_vars) == 'jet_pt')[0][0]
    eta_col = np.argwhere(np.array(training_vars) == 'abs(jet_eta)')[0][0]
    #w = reweight_to_b(X, y, pt_col, eta_col)
    w = reweight_to_l(X, y, pt_col, eta_col)
    del df_flat

    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test, y_train, y_test, w_train, w_test, ix_train, ix_test, \
    mv2c10_train, mv2c10_test, jet_pt_train, jet_pt_test, \
    ip3d_vars_train, ip3d_vars_test, ipmp_vars_train, ipmp_vars_test = train_test_split(
        X,
        y,
        w,
        ix,
        mv2c10,
        jet_pt,
        ip3d_vars,
        ipmp_vars,
        train_size=0.6)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    ip3d_vars_train = scaler.fit_transform(ip3d_vars_train)
    ip3d_vars_test = scaler.transform(ip3d_vars_test)
    ipmp_vars_train = scaler.fit_transform(ipmp_vars_train)
    ipmp_vars_test = scaler.transform(ipmp_vars_test)

    X_train, X_validate, y_train, y_validate, w_train, w_validate, ix_train, ix_validate, \
    mv2c10_train, mv2c10_validate, jet_pt_train, jet_pt_validate, \
    ip3d_vars_train, ip3d_vars_validate, ipmp_vars_train, ipmp_vars_validate = train_test_split(
        X_train,
        y_train,
        w_train,
        ix_train,
        mv2c10_train,
        jet_pt_train,
        ip3d_vars_train,
        ipmp_vars_train,
        train_size=0.7)

    train = {
        'X': X_train,
        'ip3d_vars': ip3d_vars_train,
        'ipmp_vars': ipmp_vars_train,
        'y': y_train,
        'w': w_train,
        'ix': ix_train,
        'mv2c10': mv2c10_train,
        'pt': jet_pt_train
    }

    test = {
        'X': X_test,
        'ip3d_vars': ip3d_vars_test,
        'ipmp_vars': ipmp_vars_test,
        'y': y_test,
        'w': w_test,
        'ix': ix_test,
        'mv2c10': mv2c10_test,
        'pt': jet_pt_test
    }

    validate = {
        'X': X_validate,
        'ip3d_vars': ip3d_vars_validate,
        'ipmp_vars': ipmp_vars_validate,
        'y': y_validate,
        'w': w_validate,
        'ix': ix_validate,
        'mv2c10': mv2c10_validate,
        'pt': jet_pt_validate
    }

    logger.info('Saving dictionaries to hdf5...')
    hdf5_train_path = os.path.join('..', 'data',
                                   'DL1-' + OUTNAME + str(i) + '-train-db.h5')
    hdf5_test_path = os.path.join('..', 'data',
                                  'DL1-' + OUTNAME + str(i) + '-test-db.h5')
    hdf5_validate_path = os.path.join(
        '..', 'data', 'DL1-' + OUTNAME + str(i) + '-validate-db.h5')

    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'.format(
        hdf5_train_path, hdf5_test_path, hdf5_validate_path))

    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
Example #20
 def save(self, fname):
     with warnings.catch_warnings():
         warnings.simplefilter('ignore', category=tables.NaturalNameWarning)
         dio.save(fname, self.to_dict())
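
A hedged sketch of one situation the warning filter above guards against: PyTables emits a NaturalNameWarning when an HDF5 node name is not a valid Python identifier, which can happen when dict keys like the one below are written (data and file name are made up):

import warnings

import tables
import deepdish.io as dio

data = {'1st field': [1, 2, 3]}      # key is not a valid Python identifier
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=tables.NaturalNameWarning)
    dio.save('example.h5', data)     # written without surfacing the warning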
Example #21
def integrate(prec_i,chi,beta,
              Ps=defaultPs,
              n=(512,512),l=(256.0,256.0),
              Vs_initial="uniform",rhs="oz_EQK",
              bc="periodic",it="pseudo_spectral",
              first_time = 1000.0,tol=1.0e-8,add_noise=0.01,
              fname="cont",verbose=True,
              create_movie=False,
              send_email=None, test=False):
    import deepdish.io as dd
    if send_email is not None:
        import getpass
        pswd = getpass.getpass('Password:')
    Es = {'rhs':rhs,'n':n,'l':l,'bc':bc,'it':it,
        'dt':0.1,'verbose':verbose,'analyze':False,'setPDE':True}
    if Vs_initial.endswith(".dat",-4):
        state = np.loadtxt(Vs_initial)
        m = bwhModel(Es=Es,Ps=Ps,Vs=state)
        fname = str(Vs_initial[:-4])
    else:
        m = bwhModel(Es=Es,Ps=Ps,Vs=None)
        if Vs_initial=="uniform":
            m.setup_initial_condition("uniform",p=prec_i,chi=chi,beta=beta)
        else:
            m.setup_initial_condition(Vs_initial)
    m.setup['verbose']=verbose
    yr=m.p['conv_T_to_t']
    if test:
        print("Test session:")
        print(m.p)
        print("Dimpar:",m.p['dimpar'])
        print("chi=",chi,"beta=",beta)
        print("fname:",fname,", fname type:",type(fname))
#    sol = Vs_initial
    if not test:
        print("****Starting Integration****")
        sol = m.integrate(m.initial_state,check_convergence=True,
                          max_time=first_time*yr,p=prec_i,chi=chi,beta=beta)
        print("****Integration Finished****")
    elif test:
        sol = m.initial_state
    b,w,h = m.split_state(sol)
    dd.save(fname+".hdf5",{'p':prec_i,'chi':float(chi),'beta':float(beta),
                           'Ps_dimensional':m.p['dimpar'],
                           'n':n,'l':l,'state':sol,'test':test,
                           'b':b,'w':w,'h':h})
    if send_email is not None:
        try:
            import smtplib
            from socket import gaierror
            server = smtplib.SMTP('smtp.gmail.com', 587)
            server.ehlo()
            server.starttls()
            server.login(send_email, pswd)
            msg = "\r\n".join([
                    "From: {}".format(send_email),
                    "To: {}".format(send_email),
                    "Subject: Drought bwh simulations finished",
                    "",
                    "Drought bwh simulations finished and saved to:", fname
                    ])
            server.sendmail(send_email, send_email, msg)
            server.quit()
        except gaierror:
            pass
Example #22
def main(file_name, run_name, n_tracks):

    print "Loading hdf5's from ./data/test_dict" + file_name + ".h5 and ./data/train_dict" + file_name + ".h5"
    test_dict = io.load(os.path.join('data', 'test_dict_' + file_name + '.h5'))
    train_dict = io.load(os.path.join('data', 'train_dict_' + file_name + '.h5'))
    
    X_train = train_dict['X']
    y_train = train_dict['y']    

    X_test = test_dict['X']
    y_test = test_dict['y']
    n_features = X_test.shape[2]

    # this is a df
    ip3d = test_dict['ip3d'] 

    print 'Building model...'
    # -- for track grade as a normal input:
    model = Sequential()
    model.add(Masking(mask_value=-999, input_shape=(n_tracks, n_features)))
    model.add(Dropout(0.2)) # dropping out before the GRU should help us reduce dependence on any specific input variable
    # ^^ could be desirable when training on track hits in case one detector layer fails
    model.add(GRU(25, return_sequences=False))
    # model.add(Dropout(0.2))
    model.add(Dense(4))
    model.add(Activation('softmax'))
    model.summary()

    print 'Compiling model...'
    model.compile('adam', 'categorical_crossentropy', metrics=["accuracy"])

    # -- if the pre-trained model exists, load it in, otherwise start from scratch
    try:
        _weights_location = os.path.join('weights', 'ip3d-replacement_' + MODEL_FILE + '_' + run_name +'.h5')
        model.load_weights(_weights_location)
        print 'Loaded pre-trained model from ' + _weights_location
    except IOError:
        print 'No pre-trained model found in ' + _weights_location

    print 'Training:'
    try:
        model.fit(X_train, y_train, batch_size=1024,
            callbacks = [
                EarlyStopping(verbose=True, patience=20, monitor='val_loss'),
                ModelCheckpoint(MODEL_FILE + run_name +'-progress', monitor='val_loss', verbose=True, save_best_only=True)
            ],
        nb_epoch=300, 
        validation_split = 0.2) 
        
    except KeyboardInterrupt:
        print 'Training ended early.'

    # -- load in best network
    model.load_weights(MODEL_FILE + run_name +'-progress')

    print 'Saving weights in ' + _weights_location
    model.save_weights(_weights_location, overwrite=True)

    json_string = model.to_json()
    open(MODEL_FILE + run_name +'.json', 'w').write(json_string)

    print 'Testing...'
    yhat = model.predict(X_test, verbose = True, batch_size = 1024) 
    io.save('yhat'+ run_name +'.h5', yhat) 
     
    print 'Plotting ROC...'
    fg_bl, fg_bc = plot_ROC(y_test, yhat, ip3d, run_name, MODEL_FILE)
Example #23
def simdrought(prec_i,prec_f,delta_p,delta_year,chi,beta,
               Ps=defaultPs,
               n=(512,512),l=(256.0,256.0),
               Vs_initial="uniform",rhs="oz_EQK",
               bc="periodic",it="pseudo_spectral",
               first_time = 100.0,tol=1.0e-8,add_noise=0.01,
               fname="cont",verbose=True,
               savefile=None,create_movie=False,
               send_email=None):
    import deepdish.io as dd
    if send_email is not None:
        import getpass
        pswd = getpass.getpass('Password:')
    Es = {'rhs':rhs,'n':n,'l':l,'bc':bc,'it':it,
        'dt':0.1,'verbose':verbose,'analyze':False,'setPDE':True}
    if type(Vs_initial)==str:
        fname = fname+"_"+Vs_initial
    prec_gradient_down = np.arange(prec_i,prec_f-delta_p,-delta_p)
    time_span = np.arange(delta_year,len(prec_gradient_down)*delta_year+delta_year,delta_year)
    m = bwhModel(Vs=None,Es=Es,Ps=Ps)
    if Vs_initial=="uniform":
        m.setup_initial_condition("uniform",p=prec_i,chi=chi,beta=beta)
    else:
        m.setup_initial_condition(Vs_initial)
    m.setup['verbose']=verbose
    yr=m.p['conv_T_to_t']
    # Converging on the first solution using integration and then root
    Vs_init = m.integrate(m.initial_state,check_convergence=True,
                          max_time=first_time*yr,p=prec_i,chi=chi,beta=beta)
#    Es['rhs']="oz_EQK_relax"
#    m = bwhModel(Vs=Vs_initial,Es=Es,Ps=Ps)
#    m.setup['verbose']=verbose
    Vs = Vs_init.copy()
    b_sol = np.zeros((len(prec_gradient_down),n[0],n[1]))
    w_sol = np.zeros((len(prec_gradient_down),n[0],n[1]))
    h_sol = np.zeros((len(prec_gradient_down),n[0],n[1]))
    if create_movie and savefile is not None:
        savefile_base=savefile
    for i,prec in enumerate(prec_gradient_down):
        print("Integration for p =",prec)
        if create_movie and savefile is not None:
            savefile=savefile_base+"_p{:4.3f}".format(prec).replace(".","_")
        b,w,h=m.split_state(Vs)
        if add_noise is not None:
            b=b+add_noise*np.random.random(size=b.shape)
            w=w+add_noise*np.random.random(size=w.shape)
#            h=h+add_noise*np.random.random(size=h.shape)
        Vs = np.ravel((b,w,h))
        Vs_new=m.integrate(initial_state=Vs,
                           max_time=delta_year*yr,step=yr,
                           check_convergence=False,
                           savefile=savefile,create_movie=False,
                           p=prec,chi=chi,beta=beta)
        if rhs!="oz_EQK":
            if m.converged_relaxation==False:
                time,result=m.pseudo_spectral_integrate(initial_state=Vs,
                                                        finish=delta_year*yr,
                                                        step=yr,
                                                        p=prec,chi=chi,beta=beta)
            Vs_new=result[-1]
        b,w,h=m.split_state(Vs_new)
        b_sol[i]=b
        w_sol[i]=w
        h_sol[i]=h
        Vs = np.ravel((b,w,h))
    dd.save(fname+".hdf5",{'p':prec_gradient_down,"T":time_span,
                           'chi':chi,'beta':beta,
                           'Ps_dimensional':m.p['dimpar'],
                           'n':n,'l':l,
                           'b':b_sol,
                           'w':w_sol,
                           'h':h_sol})
    if send_email is not None:
        try:
            import smtplib
            from socket import gaierror
            server = smtplib.SMTP('smtp.gmail.com', 587)
            server.ehlo()
            server.starttls()
            server.login(send_email, pswd)
            msg = "\r\n".join([
                    "From: {}".format(send_email),
                    "To: {}".format(send_email),
                    "Subject: Drought bwh simulations finished",
                    "",
                    "Drought bwh simulations finished and saved to:", fname
                    ])
            server.sendmail(send_email, send_email, msg)
            server.quit()
        except gaierror:
            pass
Example #24
                    'jet_trk_nsplitPixHits', 'jet_trk_nSCTHits',
                    'jet_trk_nsharedSCTHits', 'jet_trk_expectBLayerHit'] # 2 more to be added in `process_data`

    print 'Loading dataframes...'
    # -- currently only training and testing on one file each!
    trk_train = pup.root2panda(
        './data/train/*410000_00*.root', 
        'JetCollection', 
        branches = track_inputs + ['jet_truthflav' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi']
    )

    trk_test  = pup.root2panda(
        './data/test/*410000*.root', 
        'JetCollection', 
        branches = track_inputs + ['jet_truthflav' , 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc', 'jet_phi', 'jet_trk_phi']
    )

    print 'Processing training sample ...'
    train_dict = process_data(trk_train, savevars=True)
    del trk_train
    io.save('./data/train_dict_IPConv.h5', train_dict)

    print 'Processing test sample...'
    test_dict = process_data(trk_test)
    del trk_test
    io.save('./data/test_dict_IPConv.h5', test_dict)




Example #25
fig = plt.figure(figsize=(11.69, 8.27), dpi=100)

bins = np.linspace(
    min(min(ttbar['NMuon']), min(qcd['NMuon']), min(wjets['NMuon'])),
    max(max(ttbar['NMuon']), max(qcd['NMuon']), max(wjets['NMuon'])), 10)

_ = plt.hist(
    [ttbar['NMuon'], qcd['NMuon'], wjets['NMuon']],
    stacked=True,
    label=[r'$t\overline{t}$', 'QCD', 'wjets'],
    alpha=0.5,
    histtype='stepfilled',
    normed=False,
    bins=bins,
    weights=[ttbar['EventWeight'], qcd['EventWeight'], wjets['EventWeight']])

plt.xlabel('NMuon')
plt.ylabel('Number of Events')
plt.yscale('log')
plt.legend()
plt.plot()
plt.savefig('task3.pdf')

io.save('wj_nmuons.h5', wjets['NMuon'])
#new_df = io.load('wj_nmuons.h5')
#print new_df

pickle.dump(wjets['NMuon'], open('wj_nmuons.pkl', 'wb'))
#test = pickle.load(open('wj_nmuons.pkl', 'rb'))
#print test
Example #26
 def save(self, path):
     d = self.save_to_dict()
     d["name"] = self.name
     io.save(path, d)
Example #27
        If the event has fewer tracks, use padding; if it has more, only consider the first ntrk")
    parser.add_argument('--inputs', 
        default='grade', 
        help='one of: hits, grade')
    args = parser.parse_args()

    track_inputs, jet_inputs = generate_inputlist(args.inputs)


    print 'Loading dataframes...'
    # -- currently only training and testing on one file each!
    trk_train = pup.root2panda(
        os.path.join('data', 'train', args.train_files), 
        'bTag_AntiKt4EMTopoJets', 
        branches = track_inputs + jet_inputs
    )
    trk_test  = pup.root2panda(
        os.path.join('data', 'test', args.test_files), 
        'bTag_AntiKt4EMTopoJets', 
        branches = track_inputs + jet_inputs
    )
    print 'Processing training sample ...'
    train_dict = process_data(trk_train, jet_inputs, args.ntrk, args.sort_by, args.output, args.inputs, savevars=True)
    del trk_train
    io.save(os.path.join('data', 'train_dict_' + args.output + '.h5'), train_dict)

    print 'Processing test sample...'
    test_dict = process_data(trk_test, jet_inputs, args.ntrk, args.sort_by, args.output, args.inputs,)
    del trk_test
    io.save(os.path.join('data', 'test_dict_' + args.output + '.h5'), test_dict)
Example #28
def main(embed_size, normed, input_id, run_name):

    configure_logging()
    logger = logging.getLogger("RNNIP Training")

    logger.info("Loading hdf5's")
    test_dict = io.load(os.path.join('data', 'test_dict_' + input_id + '.h5'))
    train_dict = io.load(os.path.join('data', 'train_dict_' + input_id + '.h5'))
    
    X_train_stream0 = train_dict['grade']
    X_train_stream1 = train_dict['X']
    y_train = train_dict['y']    

    X_test_stream0 = test_dict['grade']
    X_test_stream1 = test_dict['X']
    y_test = test_dict['y']

    ip3d = test_dict['ip3d'] 

    logger.info('Building model')
    model = build_model(X_train_stream0, X_train_stream1, embed_size, normed)
    model.summary()

    logger.info('Compiling model')
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])

    #-- if the pre-trained model exists, load it in, otherwise start from scratch
    safe_mkdir('weights')
    weights_file = os.path.join('weights', 'rnnip_' + run_name +'.h5')
    try:
        model.load_weights(weights_file)
        logger.info('Loaded pre-trained model from ' + weights_file)
    except IOError:
        logger.info('No pre-trained model found in ' + weights_file)

    logger.info('Training:')
    try:
        model.fit([X_train_stream0, X_train_stream1], y_train, batch_size=512,
            callbacks = [
                EarlyStopping(verbose=True, patience=20, monitor='val_loss'),
                ModelCheckpoint(
                    weights_file, 
                    monitor='val_loss', verbose=True, save_best_only=True
                )
            ],
        epochs=300, 
        validation_split = 0.2) 
        
    except KeyboardInterrupt:
        logger.info('Training ended early.')

    # -- load in best network
    logger.info('Loading best epoch')
    model.load_weights(weights_file)

    json_string = model.to_json()
    safe_mkdir('json_models')
    open(os.path.join('json_models', run_name +'.json'), 'w').write(json_string)

    logger.info('Testing')
    safe_mkdir('predictions')
    yhat = model.predict([X_test_stream0, X_test_stream1], verbose=True, batch_size=10000) 
    io.save(os.path.join('predictions', 'yhat'+ run_name +'.h5'), yhat) 
     
    logger.info('Plotting ROC')
    plot_ROC(y_test, yhat, ip3d, run_name)
Example #29
        """
        return (
            self.cells[tup]
            for tup in self.cells
            if self.cells[tup] != -1 and self.cell_distance(cell, tup)
        )

    def update(self, point, index):
        """updates the grid with the new point

        Parameters
        ----------
        point :
            
        index :
            

        Returns
        -------

        """
        self.cells[self.cellify(point)] = index

    def __str__(self):
        return self.cells.__str__()


if __name__ == "__main__":
    bg = 255 - poisson_disk_background((640, 640), 12, 2)
    dio.save("poisson_dense.h5", bg)
Example #30
def main(inputfiles, treename, ftrain, max_n_pairs, exclude_list):
    '''
    Args:
    -----
        inputfiles: list of strings with the paths to root files
        treename: string, name of the TTree that contains the branches
        ftrain: float in range [0, 1], training fraction
        max_n_pairs: int, maximum number of jet pairs to consider per event
        exclude_list: 
    Returns:
    --------
    '''
    # -- configure logging
    utils.configure_logging()
    logger = logging.getLogger('main')

    # -- concatenate all files into a pandas df
    short_filenames = [f.split('/')[-1] for f in inputfiles]
    logger.info('Creating pandas dataframes from: {}'.format(short_filenames))
    #df = pd.concat([pup.root2panda(f, treename) for f in inputfiles], ignore_index=True)
    df_list = []
    for f in inputfiles:
        df_temp = pup.root2panda(f, treename)
        df_temp['sample'] = f.split('/')[-1].split('.')[0]
        df_list.append(df_temp)
    df = pd.concat(df_list, ignore_index=True)

    # -- remove events with more than one correct jet pair
    # -- because that shouldn't happen and complicates the task
    # -- of finding the correct jet pair
    logger.info('Removing events with more than one correct jet pair')
    keep = np.array([sum(yen) for yen in df['isCorrect'].values]) <= 1
    df = df[keep].reset_index(drop=True)

    # -- target
    logger.info('Building one-hot target')
    y = df['isCorrect'].values

    # -- extract array of names of sample of origin
    sample = df['sample'].values

    # -- prepend 1 to all entries in y where there is no correct jet pair,
    # -- 0 if there exists a correct jet pair already
    # -- each entry in y will now have length (n_jet_pairs + 1)
    y_long = np.array([
        np.insert(yev, 0, 1) if sum(yev) == 0 else np.insert(yev, 0, 0)
        for yev in y
    ])

    # -- weights
    logger.info('Extracting weights from event_weight')
    w = df['event_weight'].values
    del df['event_weight'], df['isCorrect'], df['sample']
    df = df.drop(exclude_list, axis=1)  # maybe in the future do something
    # better with these variables instead of just removing them

    # -- matrix of predictors
    X = df.values
    ix = range(X.shape[0])
    varlist = df.columns.values.tolist()

    # -- maximum number of jet pairs to consider in each event
    # -- can be set to whatever number makes sense
    #max_length = max([len(b) for b in df['Delta_eta_jb']]) + 1
    max_length = max_n_pairs + 1
    logger.info(
        'The max number of jet pairs per event will be {}'.format(max_n_pairs))

    X_train, X_test, y_train, y_test, w_train, w_test,\
    sample_train, sample_test, ix_train, ix_test, scaler_list = shuffle_split_scale_pad(
        X, y_long, w, sample, ix, ftrain, max_length
    )

    logger.info('Saving processed data as hdf5 in data/')
    io.save(
        os.path.join('data', 'train_dict.hdf5'), {
            'X': X_train,
            'y': y_train,
            'w': w_train,
            'ix': ix_train,
            'vars': varlist,
            'sample': sample_train.tolist(),
            'scalers': scaler_list
        })

    io.save(
        os.path.join('data', 'test_dict.hdf5'), {
            'X': X_test,
            'y': y_test,
            'w': w_test,
            'ix': ix_test,
            'vars': varlist,
            'sample': sample_test.tolist(),
            'scalers': scaler_list
        })
Example #31
        'jet_trk_expectBLayerHit'
    ]  # 2 more to be added in `process_data`

    print 'Loading dataframes...'
    # -- currently only training and testing on one file each!
    trk_train = pup.root2panda(
        './data/train/*410000_00*.root',
        'JetCollection',
        branches=track_inputs + [
            'jet_truthflav', 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc',
            'jet_phi', 'jet_trk_phi'
        ])

    trk_test = pup.root2panda(
        './data/test/*410000*.root',
        'JetCollection',
        branches=track_inputs + [
            'jet_truthflav', 'jet_ip3d_pu', 'jet_ip3d_pb', 'jet_ip3d_pc',
            'jet_phi', 'jet_trk_phi'
        ])

    print 'Processing training sample ...'
    train_dict = process_data(trk_train, savevars=True)
    del trk_train
    io.save('./data/train_dict_IPConv.h5', train_dict)

    print 'Processing test sample...'
    test_dict = process_data(trk_test)
    del trk_test
    io.save('./data/test_dict_IPConv.h5', test_dict)
Example #32
    def write_current_state(self):
        """
        Write the current state of the sampler to disk.

        The required information to reconstruct the state of the run are written
        to an hdf5 file.
        All but the most recent removed live point in the chain are removed from
        the sampler to reduce memory usage.
        This means it is necessary to not append the first live point to the
        file if updating a previous checkpoint.

        Parameters
        ----------
        sampler: `dynesty.NestedSampler`
            NestedSampler to write to disk.
        """
        check_directory_exists_and_if_not_mkdir(self.outdir)
        resume_file = '{}/{}_resume.h5'.format(self.outdir, self.label)

        if os.path.isfile(resume_file):
            saved = load(resume_file)

            current_state = dict(
                unit_cube_samples=np.vstack(
                    [saved['unit_cube_samples'], self.sampler.saved_u[1:]]),
                physical_samples=np.vstack(
                    [saved['physical_samples'], self.sampler.saved_v[1:]]),
                sample_likelihoods=np.concatenate(
                    [saved['sample_likelihoods'],
                     self.sampler.saved_logl[1:]]),
                sample_log_volume=np.concatenate([
                    saved['sample_log_volume'], self.sampler.saved_logvol[1:]
                ]),
                sample_log_weights=np.concatenate([
                    saved['sample_log_weights'], self.sampler.saved_logwt[1:]
                ]),
                cumulative_log_evidence=np.concatenate([
                    saved['cumulative_log_evidence'],
                    self.sampler.saved_logz[1:]
                ]),
                cumulative_log_evidence_error=np.concatenate([
                    saved['cumulative_log_evidence_error'],
                    self.sampler.saved_logzvar[1:]
                ]),
                cumulative_information=np.concatenate([
                    saved['cumulative_information'], self.sampler.saved_h[1:]
                ]),
                id=np.concatenate([saved['id'], self.sampler.saved_id[1:]]),
                it=np.concatenate([saved['it'], self.sampler.saved_it[1:]]),
                nc=np.concatenate([saved['nc'], self.sampler.saved_nc[1:]]),
                boundidx=np.concatenate(
                    [saved['boundidx'], self.sampler.saved_boundidx[1:]]),
                bounditer=np.concatenate(
                    [saved['bounditer'], self.sampler.saved_bounditer[1:]]),
                scale=np.concatenate(
                    [saved['scale'], self.sampler.saved_scale[1:]]),
            )

        else:
            current_state = dict(
                unit_cube_samples=self.sampler.saved_u,
                physical_samples=self.sampler.saved_v,
                sample_likelihoods=self.sampler.saved_logl,
                sample_log_volume=self.sampler.saved_logvol,
                sample_log_weights=self.sampler.saved_logwt,
                cumulative_log_evidence=self.sampler.saved_logz,
                cumulative_log_evidence_error=self.sampler.saved_logzvar,
                cumulative_information=self.sampler.saved_h,
                id=self.sampler.saved_id,
                it=self.sampler.saved_it,
                nc=self.sampler.saved_nc,
                boundidx=self.sampler.saved_boundidx,
                bounditer=self.sampler.saved_bounditer,
                scale=self.sampler.saved_scale,
            )

        current_state.update(ncall=self.sampler.ncall,
                             live_logl=self.sampler.live_logl,
                             iteration=self.sampler.it - 1,
                             live_u=self.sampler.live_u,
                             live_v=self.sampler.live_v,
                             nlive=self.sampler.nlive,
                             live_bound=self.sampler.live_bound,
                             live_it=self.sampler.live_it,
                             added_live=self.sampler.added_live)

        weights = np.exp(current_state['sample_log_weights'] -
                         current_state['cumulative_log_evidence'][-1])
        current_state[
            'posterior'] = self.external_sampler.utils.resample_equal(
                np.array(current_state['physical_samples']), weights)

        save(resume_file, current_state)

        self.sampler.saved_id = [self.sampler.saved_id[-1]]
        self.sampler.saved_u = [self.sampler.saved_u[-1]]
        self.sampler.saved_v = [self.sampler.saved_v[-1]]
        self.sampler.saved_logl = [self.sampler.saved_logl[-1]]
        self.sampler.saved_logvol = [self.sampler.saved_logvol[-1]]
        self.sampler.saved_logwt = [self.sampler.saved_logwt[-1]]
        self.sampler.saved_logz = [self.sampler.saved_logz[-1]]
        self.sampler.saved_logzvar = [self.sampler.saved_logzvar[-1]]
        self.sampler.saved_h = [self.sampler.saved_h[-1]]
        self.sampler.saved_nc = [self.sampler.saved_nc[-1]]
        self.sampler.saved_boundidx = [self.sampler.saved_boundidx[-1]]
        self.sampler.saved_it = [self.sampler.saved_it[-1]]
        self.sampler.saved_bounditer = [self.sampler.saved_bounditer[-1]]
        self.sampler.saved_scale = [self.sampler.saved_scale[-1]]
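The checkpoint written above is a flat dictionary of NumPy arrays and scalars, so it can be inspected directly once read back. A minimal sketch, assuming the `save` call above is deepdish.io.save (consistent with the rest of these examples) and using 'resume.h5' as a stand-in for the actual resume_file path:

import deepdish.io as io

# Load the checkpoint dictionary back for inspection.
checkpoint = io.load('resume.h5')
print(checkpoint['iteration'], checkpoint['nlive'])
print(checkpoint['cumulative_log_evidence'][-1])   # latest log-evidence estimate
posterior = checkpoint['posterior']                # equal-weight posterior samples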
Beispiel #33
0
def process(i, filepath, yaml_file, model_id):
    '''
    Process a single ROOT file: load the requested branches, build the MV2/DL1
    input variables, flatten to a jet-level dataframe, apply the calo-jet cuts,
    split into scaled train/test/validate sets, and save each split to HDF5.
    Returns the number of jets in each split.
    '''
    import pandautils as pup

    # -- load branches from yaml file
    branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features(yaml_file)
    logger = logging.getLogger("ETL Service")

    # -- load root file to dataframe
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    # -- create MV2 input quantities, set default values
    logger.info('Transforming variables...')
    df = transformVars(df)

    # -- flatten to jet-flat structure
    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    # -- apply standard cuts on AntiKT4EMTopoJets
    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    # -- create numpy arrays for ML
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values

    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- Find weights by reweighting to the light distribution
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    # -- shuffle data, split into train and test
    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test,\
    y_train, y_test,\
    w_train, w_test,\
    ix_train, ix_test, \
    mv2c10_train, mv2c10_test,\
    jet_pt_train, jet_pt_test,\
    ip3d_vars_train, ip3d_vars_test,\
    ipmp_vars_train, ipmp_vars_test = train_test_split(
        X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars, train_size=0.6
    )

    # -- scale inputs to 0 mean, 1 std
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    ip3d_vars_train = scaler.fit_transform(ip3d_vars_train)
    ip3d_vars_test = scaler.transform(ip3d_vars_test)
    ipmp_vars_train = scaler.fit_transform(ipmp_vars_train)
    ipmp_vars_test = scaler.transform(ipmp_vars_test)

    # -- split the previously selected training data into train and validate
    X_train, X_validate,\
    y_train, y_validate,\
    w_train, w_validate,\
    ix_train, ix_validate,\
    mv2c10_train, mv2c10_validate,\
    jet_pt_train, jet_pt_validate,\
    ip3d_vars_train, ip3d_vars_validate,\
    ipmp_vars_train, ipmp_vars_validate = train_test_split(
        X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train, ip3d_vars_train, ipmp_vars_train, train_size=0.7
    )

    # -- assign train, test, validate data to dictionaries
    train = {
        'X' : X_train,
        'ip3d_vars': ip3d_vars_train,
        'ipmp_vars': ipmp_vars_train,
        'y' : y_train,
        'w' : w_train,
        'ix': ix_train,
        'mv2c10': mv2c10_train,
        'pt': jet_pt_train
    }

    test = {
        'X' : X_test,
        'ip3d_vars': ip3d_vars_test,
        'ipmp_vars': ipmp_vars_test,
        'y' : y_test,
        'w' : w_test,
        'ix': ix_test,
        'mv2c10': mv2c10_test,
        'pt': jet_pt_test
    }

    validate = {
        'X' : X_validate,
        'ip3d_vars': ip3d_vars_validate,
        'ipmp_vars': ipmp_vars_validate,
        'y' : y_validate,
        'w' : w_validate,
        'ix': ix_validate,
        'mv2c10': mv2c10_validate,
        'pt': jet_pt_validate
    }

    # -- save dictionaries to hdf5
    logger.info('Saving dictionaries to hdf5...')
    hdf5_train_path = os.path.join('..', 'data', 'DL1-' + model_id + str(i) +'-train-db.h5')
    hdf5_test_path = os.path.join('..', 'data', 'DL1-' + model_id + str(i) +'-test-db.h5')
    hdf5_validate_path = os.path.join('..', 'data', 'DL1-' + model_id + str(i) +'-validate-db.h5')

    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'. format(hdf5_train_path, hdf5_test_path, hdf5_validate_path))

    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
Beispiel #34
0
def process(i, filepath, yaml_file, model_id):
    '''
    Process a single ROOT file: load the requested branches, build the MV2/DL1
    input variables, flatten to a jet-level dataframe, apply the calo-jet cuts,
    split into scaled train/test/validate sets, and save each split to HDF5.
    Returns the number of jets in each split.
    '''
    import pandautils as pup

    # -- load branches from yaml file
    branches, training_vars, ip3d_training_vars, ipmp_training_vars = set_features(
        yaml_file)
    logger = logging.getLogger("ETL Service")

    # -- load root file to dataframe
    logger.info('Operating on {}'.format(filepath))
    logger.info('Creating dataframe...')
    df = pup.root2panda(filepath, 'bTag_AntiKt4EMTopoJets', branches=branches)

    # -- create MV2 input quantities, set default values
    logger.info('Transforming variables...')
    df = transformVars(df)

    # -- flatten to jet-flat structure
    logger.info('Flattening df...')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})
    del df

    # -- apply standard cuts on AntiKT4EMTopoJets
    logger.info('Applying cuts...')
    df_flat = apply_calojet_cuts(df_flat)

    # -- create numpy arrays for ML
    logger.info('Creating X, y, w, mv2c10...')
    y = df_flat['jet_LabDr_HadF'].values
    mv2c10 = df_flat['jet_mv2c10'].values
    jet_pt = df_flat['jet_pt'].values
    ip3d_vars = df_flat[ip3d_training_vars].values
    ipmp_vars = df_flat[ipmp_training_vars].values

    # -- slice df by only keeping the training variables
    X = df_flat[training_vars].values

    # -- Find weights by reweighting to the light distribution
    pteta = df_flat[['jet_pt', 'abs(jet_eta)']].values
    w = reweight_to_l(pteta, y, pt_col=0, eta_col=1)
    del df_flat, pteta

    # -- shuffle data, split into train and test
    logger.info('Shuffling, splitting, scaling...')
    ix = np.array(range(len(y)))
    X_train, X_test,\
    y_train, y_test,\
    w_train, w_test,\
    ix_train, ix_test, \
    mv2c10_train, mv2c10_test,\
    jet_pt_train, jet_pt_test,\
    ip3d_vars_train, ip3d_vars_test,\
    ipmp_vars_train, ipmp_vars_test = train_test_split(
        X, y, w, ix, mv2c10, jet_pt, ip3d_vars, ipmp_vars, train_size=0.6
    )

    # -- scale inputs to 0 mean, 1 std
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    ip3d_vars_train = scaler.fit_transform(ip3d_vars_train)
    ip3d_vars_test = scaler.transform(ip3d_vars_test)
    ipmp_vars_train = scaler.fit_transform(ipmp_vars_train)
    ipmp_vars_test = scaler.transform(ipmp_vars_test)

    # -- split the previously selected training data into train and validate
    X_train, X_validate,\
    y_train, y_validate,\
    w_train, w_validate,\
    ix_train, ix_validate,\
    mv2c10_train, mv2c10_validate,\
    jet_pt_train, jet_pt_validate,\
    ip3d_vars_train, ip3d_vars_validate,\
    ipmp_vars_train, ipmp_vars_validate = train_test_split(
        X_train, y_train, w_train, ix_train, mv2c10_train, jet_pt_train, ip3d_vars_train, ipmp_vars_train, train_size=0.7
    )

    # -- assign train, test, validate data to dictionaries
    train = {
        'X': X_train,
        'ip3d_vars': ip3d_vars_train,
        'ipmp_vars': ipmp_vars_train,
        'y': y_train,
        'w': w_train,
        'ix': ix_train,
        'mv2c10': mv2c10_train,
        'pt': jet_pt_train
    }

    test = {
        'X': X_test,
        'ip3d_vars': ip3d_vars_test,
        'ipmp_vars': ipmp_vars_test,
        'y': y_test,
        'w': w_test,
        'ix': ix_test,
        'mv2c10': mv2c10_test,
        'pt': jet_pt_test
    }

    validate = {
        'X': X_validate,
        'ip3d_vars': ip3d_vars_validate,
        'ipmp_vars': ipmp_vars_validate,
        'y': y_validate,
        'w': w_validate,
        'ix': ix_validate,
        'mv2c10': mv2c10_validate,
        'pt': jet_pt_validate
    }

    # -- save dictionaries to hdf5
    logger.info('Saving dictionaries to hdf5...')
    hdf5_train_path = os.path.join('..', 'data',
                                   'DL1-' + model_id + str(i) + '-train-db.h5')
    hdf5_test_path = os.path.join('..', 'data',
                                  'DL1-' + model_id + str(i) + '-test-db.h5')
    hdf5_validate_path = os.path.join(
        '..', 'data', 'DL1-' + model_id + str(i) + '-validate-db.h5')

    io.save(hdf5_train_path, train)
    io.save(hdf5_test_path, test)
    io.save(hdf5_validate_path, validate)
    logger.debug('Saved hdf5 archives: {}, {}, {}'.format(
        hdf5_train_path, hdf5_test_path, hdf5_validate_path))

    return (y_train.shape[0], y_test.shape[0], y_validate.shape[0])
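Each archive written above round-trips through deepdish as a plain dict of NumPy arrays. A short sketch of reading one back, using the hypothetical values model_id='test' and i=0 for the file name:

import deepdish.io as io

# Load the training split saved by process(); keys mirror the dict above.
train = io.load('../data/DL1-test0-train-db.h5')
X_train, y_train, w_train = train['X'], train['y'], train['w']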
Beispiel #35
0
def main(inputfiles, treename='bTag_AntiKt2PV0TrackJets'):

    configure_logging()
    logger = logging.getLogger('ProcessTrackJetData')

    # -- import root files into df
    logger.info('Importing ROOT files into pandas dataframes')
    df = pup.root2panda(
        inputfiles,
        treename,
        branches=[
            'jet_pt', 'jet_eta', 'jet_phi', 'jet_m', 'jet_ip2d_pu',
            'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc',
            'jet_ip3d_pb', 'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z',
            'jet_sv1_ntrkv', 'jet_sv1_m', 'jet_sv1_efc', 'jet_sv1_n2t',
            'jet_sv1_sig3d', 'jet_jf_n2t', 'jet_jf_ntrkAtVx', 'jet_jf_nvtx',
            'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_sig3d',
            'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz',
            'jet_aliveAfterOR', 'jet_aliveAfterORmu', 'jet_nConst',
            'jet_LabDr_HadF'
        ])

    # -- Insert default values, calculate MV2 variables from the branches in df
    logger.info('Creating MV2 variables')
    df = transformVars(df)

    # -- Flatten from event-flat to jet-flat
    # -- Before doing so, remove event-level variables such as PVx,y,z
    logger.info('Flattening dataframe')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})

    # -- apply eta, pt, OR cuts from b-tagging recommendations
    logger.info('Applying cuts')
    df_flat = applycuts(df_flat)

    # -- build X, y, w
    # -- target values
    y = df_flat['jet_LabDr_HadF'].values

    # -- slice df by only keeping the 24 variables for MV2 training
    training_vars = [
        'jet_pt', 'abs(jet_eta)', 'jet_ip2', 'jet_ip2_c', 'jet_ip2_cu',
        'jet_ip3', 'jet_ip3_c', 'jet_ip3_cu', 'jet_sv1_ntrkv', 'jet_sv1_m',
        'jet_sv1_efc', 'jet_sv1_n2t', 'jet_sv1_Lxy', 'jet_sv1_L3d',
        'jet_sv1_sig3d', 'jet_sv1_dR', 'jet_jf_n2t', 'jet_jf_ntrkAtVx',
        'jet_jf_nvtx', 'jet_jf_nvtx1t', 'jet_jf_m', 'jet_jf_efc', 'jet_jf_dR',
        'jet_jf_sig3d'
    ]
    X = df_flat[training_vars].values
    logger.info(
        '2D pT and eta reweighting of charm and light to bottom distribution')
    w = reweight_to_b(X, y)

    X, y, w = remove_tau(X, y, w)

    # -- turn classes 0, 4, 5, 15 to 0, 1, 2, 3
    # le = LabelEncoder()
    # y = le.fit_transform(y)

    # -- randomly shuffle and split into train and test set
    logger.info('Shuffling and splitting')
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
        X, y, w, train_size=0.6)

    # -- save out to hdf5
    logger.info('Saving data to hdf5')
    # deepdish.io.save expects a file path, not an open file handle
    io.save('train_data.h5', {
        'X': X_train,
        'y': y_train,
        'w': w_train
    })
    io.save('test_data.h5', {
        'X': X_test,
        'y': y_test,
        'w': w_test
    })
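Because deepdish writes each dictionary key to its own HDF5 group, individual arrays can be read back without loading the whole file. A minimal sketch against the training archive written above, using deepdish.io.load's group-path argument:

import deepdish.io as io

# Read only the feature matrix and labels from the training archive.
X_train = io.load('train_data.h5', '/X')
y_train = io.load('train_data.h5', '/y')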
Beispiel #36
0
def save(self, path):
    d = self.save_to_dict()
    d['name'] = self.name
    io.save(path, d)
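The dictionary written by this method can be read back with deepdish.io.load, and the extra 'name' entry is recovered alongside the saved attributes; a minimal sketch with an illustrative path:

import deepdish.io as io

d = io.load('model.h5')   # 'model.h5' is a placeholder path
print(d['name'])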
Beispiel #37
0
#use : $python n2a.py /path/[ntupleName].root
import numpy as np
from numpy.lib.recfunctions import stack_arrays
from ROOT import *
from root_numpy import tree2array
import glob
import pandas as pd
import deepdish.io as io

import sys

arg = sys.argv[1] #arg = ntuple.root

input_ntuple = TFile.Open(arg)
input_ntuple_tree = input_ntuple.Get('dnn_input')
input_ntuple_array = tree2array(input_ntuple_tree)
input_ntuple_df = pd.DataFrame(input_ntuple_array)

#a = ana[ntupleName].root (strip the directory path)
a = arg.split('/')[-1]
a = a.replace("ana", "")        #a = [ntupleName].root
a = a.replace(".root", ".h5")   #a = [ntupleName].h5

#saved as [ntupleName].h5
io.save(a, input_ntuple_df)
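A quick round-trip check of the conversion, assuming the script was invoked on a hypothetical file named anaMyNtuple.root so the output is MyNtuple.h5:

import deepdish.io as io

# deepdish restores the pandas DataFrame saved above.
df_check = io.load('MyNtuple.h5')
print(len(df_check), df_check.columns.tolist())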
Beispiel #38
0
def main(inputfiles, treename='bTag_AntiKt2PV0TrackJets'):

    configure_logging()
    logger = logging.getLogger('ProcessTrackJetData')

    # -- import root files into df
    logger.info('Importing ROOT files into pandas dataframes')
    df = pup.root2panda(inputfiles, treename, branches = [
            'jet_pt', 'jet_eta','jet_phi', 'jet_m', 'jet_ip2d_pu', 
            'jet_ip2d_pc', 'jet_ip2d_pb', 'jet_ip3d_pu', 'jet_ip3d_pc','jet_ip3d_pb',
            'jet_sv1_vtx_x', 'jet_sv1_vtx_y', 'jet_sv1_vtx_z', 'jet_sv1_ntrkv',
            'jet_sv1_m','jet_sv1_efc','jet_sv1_n2t','jet_sv1_sig3d',
            'jet_jf_n2t','jet_jf_ntrkAtVx','jet_jf_nvtx','jet_jf_nvtx1t','jet_jf_m',
            'jet_jf_efc','jet_jf_sig3d', 'jet_jf_deta', 'jet_jf_dphi', 'PVx', 'PVy', 'PVz',
            'jet_aliveAfterOR', 'jet_aliveAfterORmu', 'jet_nConst', 'jet_LabDr_HadF'])

    # -- Insert default values, calculate MV2 variables from the branches in df
    logger.info('Creating MV2 variables')
    df = transformVars(df)

    # -- Flatten from event-flat to jet-flat
    # -- Before doing so, remove event-level variables such as PVx,y,z
    logger.info('Flattening dataframe')
    df.drop(['PVx', 'PVy', 'PVz'], axis=1, inplace=True)
    df_flat = pd.DataFrame({k: pup.flatten(c) for k, c in df.iteritems()})

    # -- apply eta, pt, OR cuts from b-tagging recommendations
    logger.info('Applying cuts')
    df_flat = applycuts(df_flat)

    # -- build X, y, w
    # -- target values
    y = df_flat['jet_LabDr_HadF'].values

    # -- slice df by only keeping the 24 variables for MV2 training
    training_vars = [
        'jet_pt', 
        'abs(jet_eta)', 
        'jet_ip2',
        'jet_ip2_c',
        'jet_ip2_cu',
        'jet_ip3',
        'jet_ip3_c',
        'jet_ip3_cu',
        'jet_sv1_ntrkv',
        'jet_sv1_m',
        'jet_sv1_efc',
        'jet_sv1_n2t',
        'jet_sv1_Lxy',
        'jet_sv1_L3d',
        'jet_sv1_sig3d',
        'jet_sv1_dR',
        'jet_jf_n2t',
        'jet_jf_ntrkAtVx',
        'jet_jf_nvtx',
        'jet_jf_nvtx1t',
        'jet_jf_m',
        'jet_jf_efc',
        'jet_jf_dR',
        'jet_jf_sig3d'] 
    X = df_flat[training_vars].values
    logger.info('2D pT and eta reweighting of charm and light to bottom distribution')
    w = reweight_to_b(X, y)

    X, y, w = remove_tau(X, y, w)

    # -- turn classes 0, 4, 5, 15 to 0, 1, 2, 3
    # le = LabelEncoder()
    # y = le.fit_transform(y)

    # -- randomly shuffle and split into train and test set
    logger.info('Shuffling and splitting')
    X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(X, y, w, train_size = 0.6)

    # -- save out to hdf5
    logger.info('Saving data to hdf5')
    # deepdish.io.save expects a file path, not an open file handle
    io.save('train_data.h5', {'X' : X_train, 'y' : y_train, 'w' : w_train})
    io.save('test_data.h5', {'X' : X_test, 'y' : y_test, 'w' : w_test})
Beispiel #39
0
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import cPickle as pickle
from root_numpy import root2array
import deepdish.io as io

ttbar = root2array('ttbar.root')
qcd = root2array('qcd.root')
wjets = root2array('wjets.root')

matplotlib.rcParams.update({'font.size': 16})
fig = plt.figure(figsize=(11.69, 8.27), dpi=100)

bins = np.linspace(
    min(min(ttbar['NMuon']), min(qcd['NMuon']), min(wjets['NMuon'])),
    max(max(ttbar['NMuon']), max(qcd['NMuon']), max(wjets['NMuon'])),
    10)

_ = plt.hist(
    [ttbar['NMuon'], qcd['NMuon'], wjets['NMuon']],
    stacked=True,
    label=[r'$t\overline{t}$', 'QCD', 'wjets'],
    alpha=0.5,
    histtype='stepfilled',
    normed=False,
    bins=bins,
    weights=[ttbar['EventWeight'], qcd['EventWeight'], wjets['EventWeight']])

plt.xlabel('NMuon')
plt.ylabel('Number of Events')
plt.yscale('log')
plt.legend()
plt.plot()
plt.savefig('task3.pdf')

io.save('wj_nmuons.h5', wjets['NMuon'])
#new_df = io.load('wj_nmuons.h5')
#print new_df

pickle.dump(wjets['NMuon'], open('wj_nmuons.pkl', 'wb'))
#test = pickle.load(open('wj_nmuons.pkl', 'rb'))
#print test