예제 #1
0
def fit_mass(data,
             column,
             x,
             sig_pdf=None,
             bkg_pdf=None,
             n_sig=None,
             n_bkg=None,
             blind=False,
             nll_profile=False,
             second_storage=None,
             log_plot=False,
             pulls=True,
             sPlot=False,
             bkg_in_region=False,
             importance=3,
             plot_importance=3):
    """Fit a given pdf to a variable distribution


    Parameter
    ---------
    data : |hepds_type|
        The data containing the variable to fit to
    column : str
        The name of the column to fit the pdf to
    sig_pdf : RooFit pdf
        The signal Probability Density Function. The variable to fit to has
        to be named 'x'.
    bkg_pdf : RooFit pdf
        The background Probability Density Function. The variable to fit to has
        to be named 'x'.
    n_sig : None or numeric
        The number of signals in the data. If it should be fitted, use None.
    n_bkg : None or numeric
        The number of background events in the data.
        If it should be fitted, use None.
    blind : boolean or tuple(numberic, numberic)
        If False, the data is fitted. If a tuple is provided, the values are
        used as the lower (the first value) and the upper (the second value)
        limit of a blinding region, which will be omitted in plots.
        Additionally, no true number of signal will be returned but only fake.
    nll_profile : boolean
        If True, a Negative Log-Likelihood Profile will be generated. Does not
        work with blind fits.
    second_storage : |hepds_type|
        A second data-storage that will be concatenated with the first one.
    importance : |importance_type|
        |importance_docstring|
    plot_importance : |plot_importance_type|
        |plot_importance_docstring|

    Return
    ------
    tuple(numerical, numerical)
        Return the number of signals and the number of backgrounds in the
        signal-region. If a blind fit is performed, the signal will be a fake
        number. If no number of background events is required, -999 will be
        returned.
    """

    if not (isinstance(column, str) or len(column) == 1):
        raise ValueError("Fitting to several columns " + str(column) +
                         " not supported.")
    if type(sig_pdf) == type(bkg_pdf) == None:
        raise ValueError("sig_pdf and bkg_pdf are both None-> no fit possible")
    if blind is not False:
        lower_blind, upper_blind = blind
        blind = True

    n_bkg_below_sig = -999
    # create data
    data_name = data.name
    data_array, _t1, _t2 = data.make_dataset(second_storage, columns=column)
    del _t1, _t2

    # double crystalball variables
    min_x, max_x = min(data_array[column]), max(data_array[column])

    #    x = RooRealVar("x", "x variable", min_x, max_x)

    # create data
    data_array = np.array([i[0] for i in data_array.as_matrix()])
    data_array.dtype = [('x', np.float64)]
    tree1 = array2tree(data_array, "x")
    data = RooDataSet("data", "Data", RooArgSet(x), RooFit.Import(tree1))

    #    # TODO: export somewhere? does not need to be defined inside...
    #    mean = RooRealVar("mean", "Mean of Double CB PDF", 5280, 5100, 5600)#, 5300, 5500)
    #    sigma = RooRealVar("sigma", "Sigma of Double CB PDF", 40, 0.001, 200)
    #    alpha_0 = RooRealVar("alpha_0", "alpha_0 of one side", 5.715)#, 0, 150)
    #    alpha_1 = RooRealVar("alpha_1", "alpha_1 of other side", -4.019)#, -200, 0.)
    #    lambda_0 = RooRealVar("lambda_0", "Exponent of one side", 3.42)#, 0, 150)
    #    lambda_1 = RooRealVar("lambda_1", "Exponent of other side", 3.7914)#, 0, 500)
    #
    #    # TODO: export somewhere? pdf construction
    #    frac = RooRealVar("frac", "Fraction of crystal ball pdfs", 0.479, 0.01, 0.99)
    #
    #    crystalball1 = RooCBShape("crystallball1", "First CrystalBall PDF", x,
    #                              mean, sigma, alpha_0, lambda_0)
    #    crystalball2 = RooCBShape("crystallball2", "Second CrystalBall PDF", x,
    #                              mean, sigma, alpha_1, lambda_1)
    #    doubleCB = RooAddPdf("doubleCB", "Double CrystalBall PDF",
    #                         crystalball1, crystalball2, frac)

    #    n_sig = RooRealVar("n_sig", "Number of signals events", 10000, 0, 1000000)

    # test input
    if n_sig == n_bkg == 0:
        raise ValueError("n_sig as well as n_bkg is 0...")

    if n_bkg is None:
        n_bkg = RooRealVar("n_bkg", "Number of background events", 10000, 0,
                           500000)
    elif n_bkg >= 0:
        n_bkg = RooRealVar("n_bkg", "Number of background events", int(n_bkg))
    else:
        raise ValueError("n_bkg is not >= 0 or None")

    if n_sig is None:
        n_sig = RooRealVar("n_sig", "Number of signal events", 1050, 0, 200000)

        # START BLINDING
        blind_cat = RooCategory("blind_cat", "blind state category")
        blind_cat.defineType("unblind", 0)
        blind_cat.defineType("blind", 1)
        if blind:
            blind_cat.setLabel("blind")
            blind_n_sig = RooUnblindPrecision("blind_n_sig",
                                              "blind number of signals",
                                              "wasistdas", n_sig.getVal(),
                                              10000, n_sig, blind_cat)
        else:
            #            blind_cat.setLabel("unblind")
            blind_n_sig = n_sig

        print "n_sig value " + str(n_sig.getVal())
#        raw_input("blind value " + str(blind_n_sig.getVal()))

#        n_sig = blind_n_sig

# END BLINDING
    elif n_sig >= 0:
        n_sig = RooRealVar("n_sig", "Number of signal events", int(n_sig))
    else:
        raise ValueError("n_sig is not >= 0")

#    if not blind:
#        blind_n_sig = n_sig

#    # create bkg-pdf
#    lambda_exp = RooRealVar("lambda_exp", "lambda exp pdf bkg", -0.00025, -1., 1.)
#    bkg_pdf = RooExponential("bkg_pdf", "Background PDF exp", x, lambda_exp)

    if blind:
        comb_pdf = RooAddPdf("comb_pdf", "Combined DoubleCB and bkg PDF",
                             RooArgList(sig_pdf, bkg_pdf),
                             RooArgList(blind_n_sig, n_bkg))
    else:
        comb_pdf = RooAddPdf("comb_pdf", "Combined DoubleCB and bkg PDF",
                             RooArgList(sig_pdf, bkg_pdf),
                             RooArgList(n_sig, n_bkg))

    # create test dataset
#    mean_gauss = RooRealVar("mean_gauss", "Mean of Gaussian", 5553, -10000, 10000)
#    sigma_gauss = RooRealVar("sigma_gauss", "Width of Gaussian", 20, 0.0001, 300)
#    gauss1 = RooGaussian("gauss1", "Gaussian test dist", x, mean_gauss, sigma_gauss)
#    lambda_data = RooRealVar("lambda_data", "lambda exp data", -.002)
#    exp_data = RooExponential("exp_data", "data example exp", x, lambda_data)
#    frac_data = RooRealVar("frac_data", "Fraction PDF of data", 0.15)
#
#    data_pdf = RooAddPdf("data_pdf", "Data PDF", gauss1, exp_data, frac_data)
#    data = data_pdf.generate(RooArgSet(x), 30000)

#    data.printValue()
#    xframe = x.frame()
#    data_pdf.plotOn(xframe)
#    print "n_cpu:", meta_config.get_n_cpu()
#    input("test")
#    comb_pdf.fitTo(data, RooFit.Extended(ROOT.kTRUE), RooFit.NumCPU(meta_config.get_n_cpu()))
#     HACK to get 8 cores in testing
    c5 = TCanvas("c5", "RooFit pdf not fit vs " + data_name)
    c5.cd()
    x_frame1 = x.frame()
    #    data.plotOn(x_frame1)
    #    comb_pdf.pdfList()[1].plotOn(x_frame1)

    if __name__ == "__main__":
        n_cpu = 8
    else:
        n_cpu = meta_config.get_n_cpu()
        print "n_cpu = ", n_cpu
        # HACK
#        n_cpu = 8
    result_fit = comb_pdf.fitTo(data, RooFit.Minos(ROOT.kTRUE),
                                RooFit.Extended(ROOT.kTRUE),
                                RooFit.NumCPU(n_cpu))
    # HACK end
    if bkg_in_region:
        x.setRange("signal", bkg_in_region[0], bkg_in_region[1])
        bkg_pdf_fitted = comb_pdf.pdfList()[1]
        int_argset = RooArgSet(x)
        #        int_argset = x
        #        int_argset.setRange("signal", bkg_in_region[0], bkg_in_region[1])
        integral = bkg_pdf_fitted.createIntegral(int_argset,
                                                 RooFit.NormSet(int_argset),
                                                 RooFit.Range("signal"))
        bkg_cdf = bkg_pdf_fitted.createCdf(int_argset, RooFit.Range("signal"))
        bkg_cdf.plotOn(x_frame1)

        #        integral.plotOn(x_frame1)
        n_bkg_below_sig = integral.getVal(int_argset) * n_bkg.getVal()
        x_frame1.Draw()

    if plot_importance >= 3:
        c2 = TCanvas("c2", "RooFit pdf fit vs " + data_name)
        c2.cd()
        x_frame = x.frame()
        #        if log_plot:
        #            c2.SetLogy()
        #        x_frame.SetTitle("RooFit pdf vs " + data_name)
        x_frame.SetTitle(data_name)
        if pulls:
            pad_data = ROOT.TPad("pad_data", "Pad with data and fit", 0, 0.33,
                                 1, 1)
            pad_pulls = ROOT.TPad("pad_pulls", "Pad with data and fit", 0, 0,
                                  1, 0.33)
            pad_data.SetBottomMargin(0.00001)
            pad_data.SetBorderMode(0)
            if log_plot:
                pad_data.SetLogy()
            pad_pulls.SetTopMargin(0.00001)
            pad_pulls.SetBottomMargin(0.2)
            pad_pulls.SetBorderMode(0)
            pad_data.Draw()
            pad_pulls.Draw()
            pad_data.cd()
        else:
            if log_plot:
                c2.SetLogy()
    if blind:
        # HACK
        column = 'x'
        # END HACK
        x.setRange("lower", min_x, lower_blind)
        x.setRange("upper", upper_blind, max_x)
        range_str = "lower,upper"
        lower_cut_str = str(
            min_x) + "<=" + column + "&&" + column + "<=" + str(lower_blind)
        upper_cut_str = str(
            upper_blind) + "<=" + column + "&&" + column + "<=" + str(max_x)
        sideband_cut_str = "(" + lower_cut_str + ")" + "||" + "(" + upper_cut_str + ")"

        n_entries = data.reduce(
            sideband_cut_str).numEntries() / data.numEntries()
        #        raw_input("n_entries: " + str(n_entries))
        if plot_importance >= 3:
            data.plotOn(x_frame, RooFit.CutRange(range_str),
                        RooFit.NormRange(range_str))
            comb_pdf.plotOn(
                x_frame, RooFit.Range(range_str),
                RooFit.Normalization(n_entries, RooAbsReal.Relative),
                RooFit.NormRange(range_str))
            if pulls:
                #                pull_hist(pull_frame=x_frame, pad_data=pad_data, pad_pulls=pad_pulls)
                x_frame_pullhist = x_frame.pullHist()
    else:
        if plot_importance >= 3:
            data.plotOn(x_frame)
            comb_pdf.plotOn(x_frame)
            if pulls:
                pad_pulls.cd()
                x_frame_pullhist = x_frame.pullHist()
                pad_data.cd()

            comb_pdf.plotOn(x_frame,
                            RooFit.Components(sig_pdf.namePtr().GetName()),
                            RooFit.LineStyle(ROOT.kDashed))
            comb_pdf.plotOn(x_frame,
                            RooFit.Components(bkg_pdf.namePtr().GetName()),
                            RooFit.LineStyle(ROOT.kDotted))
#            comb_pdf.plotPull(n_sig)

    if plot_importance >= 3:
        x_frame.Draw()

        if pulls:
            pad_pulls.cd()
            x_frame.SetTitleSize(0.05, 'Y')
            x_frame.SetTitleOffset(0.7, 'Y')
            x_frame.SetLabelSize(0.04, 'Y')

            #            c11 = TCanvas("c11", "RooFit\ pulls" + data_name)
            #            c11.cd()
            #            frame_tmp = x_frame
            frame_tmp = x.frame()

            #            frame_tmp.SetTitle("significance")

            frame_tmp.SetTitle("Roofit\ pulls\ " + data_name)
            frame_tmp.addObject(x_frame_pullhist)

            frame_tmp.SetMinimum(-5)
            frame_tmp.SetMaximum(5)

            #            frame_tmp.GetYaxis().SetTitle("significance")
            frame_tmp.GetYaxis().SetNdivisions(5)
            frame_tmp.SetTitleSize(0.1, 'X')
            frame_tmp.SetTitleOffset(1, 'X')
            frame_tmp.SetLabelSize(0.1, 'X')
            frame_tmp.SetTitleSize(0.1, 'Y')
            frame_tmp.SetTitleOffset(0.5, 'Y')
            frame_tmp.SetLabelSize(0.1, 'Y')

            frame_tmp.Draw()

#    raw_input("")

    if not blind and nll_profile:

        #        nll_range = RooRealVar("nll_range", "Signal for nLL", n_sig.getVal(),
        #                               -10, 2 * n_sig.getVal())
        sframe = n_sig.frame(RooFit.Bins(20), RooFit.Range(1, 1000))
        # HACK for best n_cpu
        lnL = comb_pdf.createNLL(data, RooFit.NumCPU(8))
        # HACK end
        lnProfileL = lnL.createProfile(ROOT.RooArgSet(n_sig))
        lnProfileL.plotOn(sframe, RooFit.ShiftToZero())
        c4 = TCanvas("c4", "NLL Profile")
        c4.cd()

        #        input("press ENTER to show plot")
        sframe.Draw()

    if plot_importance >= 3:
        pass

    params = comb_pdf.getVariables()
    params.Print("v")

    #    print bkg_cdf.getVal()

    if sPlot:
        sPlotData = ROOT.RooStats.SPlot(
            "sPlotData",
            "sPlotData",
            data,  # variable fitted to, RooDataSet
            comb_pdf,  # fitted pdf
            ROOT.RooArgList(
                n_sig,
                n_bkg,
                #                                                NSigB0s
            ))
        sweights = np.array([
            sPlotData.GetSWeight(i, 'n_sig') for i in range(data.numEntries())
        ])
        return n_sig.getVal(), n_bkg_below_sig, sweights

    if blind:
        return blind_n_sig.getVal(), n_bkg_below_sig, comb_pdf
    else:
        return n_sig.getVal(), n_bkg_below_sig, comb_pdf