Example #1
0
def plot_reweighted(reweighted_vars, DATAS_df, MC_df):
    """Plot data, plain MC and reweighted MC distributions for each variable.

    For every column name in ``reweighted_vars`` a new figure is opened and
    three normalised, 100-bin point histograms are drawn with
    ``matplotlib_hep.histpoints``:

    * the data column weighted by ``DATAS_df['nSig_sw']`` (black),
    * the unweighted MC column (red),
    * the MC column weighted by ``MC_df['wghts_tot']`` (green).

    Each figure is displayed with ``plt.show()`` before the next one is made.

    Parameters
    ----------
    reweighted_vars : iterable
        Column names to plot.  Each name must be a column of both data frames
        and a key of the module-level ``vars_to_reweight_dict``; element ``[0]``
        of that entry is used as the x-axis label.
    DATAS_df : DataFrame-like
        Data sample; must also provide the ``'nSig_sw'`` weight column.
    MC_df : DataFrame-like
        Simulated sample; must also provide the ``'wghts_tot'`` weight column.
    """
    import matplotlib.pyplot as plt
    from matplotlib_hep import histpoints

    # Fixed histogram ranges for selected variables; every other variable
    # lets the histogram pick its own range (None).  A lookup table avoids
    # the earlier bug where an independent ``if/else`` for 'nTracks' reset
    # the 'B_PT' range back to None.
    fixed_ranges = {
        'B_PT': (0, 50000),
        'nTracks': (0, 600),
    }

    for col in reweighted_vars:
        plt.figure()

        hist_range = fixed_ranges.get(col)

        data_col = DATAS_df[col].values
        mc_col = MC_df[col].values

        # sWeighted data
        histpoints(data_col, range=hist_range, color='black', normed=True,
                   bins=100, label='sWeighted DATA', markersize=3.,
                   marker='o', weights=DATAS_df['nSig_sw'])

        # MC before reweighting
        histpoints(mc_col, range=hist_range, color='red', normed=True,
                   bins=100, label='MC', markersize=3., marker='o')

        # MC after reweighting
        histpoints(mc_col, range=hist_range, color='green', normed=True,
                   bins=100, weights=MC_df['wghts_tot'], markersize=3.,
                   label='Reweighted MC', marker='o')

        plt.xlabel(vars_to_reweight_dict[col][0])
        plt.ylabel('Events')
        plt.legend(prop={'size': 10})

        plt.show()
Example #2
0
def _select_theta(df, threshold, off_positions=range(1, 6)):
    """Return ``(theta_on, theta_off)`` for events passing a prediction cut.

    ``theta_on`` holds the ``'Theta'`` values of rows whose ``'prediction_on'``
    exceeds ``threshold``.  ``theta_off`` is the concatenation, over every
    off position ``i``, of the ``'Theta_Off_i'`` values of rows whose
    ``'prediction_off_i'`` exceeds ``threshold``.
    """
    theta_on = df['Theta'][df['prediction_on'] > threshold]
    # A single concat replaces the removed ``Series.append`` (pandas >= 2.0)
    # and avoids re-copying the growing series once per off position.
    theta_off = pd.concat([
        df['Theta_Off_{}'.format(pos)][df['prediction_off_{}'.format(pos)] > threshold]
        for pos in off_positions
    ])
    return theta_on, theta_off


def main():
    """Plot the on/off theta^2 distributions and save the figure.

    Command line options are parsed with ``docopt`` from the module docstring.
    The HDF table is loaded, optionally restricted to a range of nights, and
    the theta columns are converted with ``theta_mm_to_theta_squared_deg``.
    A grid search over prediction thresholds and theta cuts logs the
    combination with the highest Li & Ma significance; the plot itself uses
    the cuts given on the command line (``--threshold``, ``--theta2-cut``).
    The resulting figure, including an info box with the significance and
    event counts, is written to ``<outputfile>``.
    """
    logger = logging.getLogger(__name__)
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' + '%(message)s'), level=logging.DEBUG)
    logger.debug(matplotlib.matplotlib_fname())

    args = docopt(__doc__)
    path = args["<datafile>"]
    out = args["<outputfile>"]

    tablename = args["--tablename"]
    n_bins = int(args["--bins"])
    title = args["--title"]

    first_night = args["--first"]
    last_night = args["--last"]

    theta_cut = float(args['--theta2-cut'])
    prediction_threshold = float(args['--threshold'])

    # On/off exposure ratio passed to li_ma_significance and used to scale
    # the off histogram.  Presumably 1/5 because five off positions are
    # used below -- TODO confirm.
    alpha = 0.2

    df = pd.read_hdf(path, key=tablename)
    # ':' in column names is replaced so the names work in df.query() and
    # attribute access.
    df.columns = [c.replace(':', '_') for c in df.columns]

    if first_night:
        df = df.query('(NIGHT >= {})'.format(first_night)).copy()
        logger.info('Using only Data begining with night {}'.format(first_night))

    if last_night:
        df = df.query('(NIGHT <= {})'.format(last_night)).copy()
        logger.info('Using only Data until night {}'.format(last_night))

    night_stats = df.NIGHT.describe()
    actual_first_night = night2datetime(int(night_stats['min']))
    actual_last_night = night2datetime(int(night_stats['max']))

    period = 'Period: {:%Y-%m-%d} to {:%Y-%m-%d}'.format(actual_first_night, actual_last_night)
    logger.debug('Using Nights from {}'.format(period))

    theta_keys = ["Theta"] + ['Theta_Off_{}'.format(off_position) for off_position in range(1, 6)]
    df[theta_keys] = df[theta_keys].apply(theta_mm_to_theta_squared_deg, axis=0)

    # Grid search for the cut combination with the highest significance.
    # The result is only logged; the plot below uses the command line cuts.
    best_significance = 0
    best_prediction_threshold = 0
    best_theta_cut = 0

    for threshold in np.linspace(0.5, 1, 20):
        t_on, t_off = _select_theta(df, threshold)

        for t_cut in np.linspace(0.5, 0.001, 50):
            n_on = int((t_on < t_cut).sum())
            n_off = int((t_off < t_cut).sum())
            significance = li_ma_significance(n_on, n_off, alpha=alpha)
            if significance > best_significance:
                best_theta_cut = t_cut
                best_significance = significance
                best_prediction_threshold = threshold

    logger.info('Maximum Significance {}, for Theta Cut at {} and confidence cut at {}'.format(best_significance, best_theta_cut, best_prediction_threshold))

    # Event selection with the cuts requested on the command line.
    theta_on, theta_off = _select_theta(df, prediction_threshold)

    n_on = int((theta_on < theta_cut).sum())
    n_off = int((theta_off < theta_cut).sum())
    logger.info('N_on = {}, N_off = {}'.format(n_on, n_off))

    excess_events = n_on - alpha * n_off
    significance = li_ma_significance(n_on, n_off, alpha=alpha)

    logger.info(
        'Chosen cuts for prediction threshold {} has signifcance: {} with  a theta sqare cut of {}.'.format(
            prediction_threshold, significance, theta_cut
    ))

    theta_max = 0.3
    bin_edges = np.linspace(0, theta_max, n_bins)

    # Geometry of the info box drawn above the plot, in axes coordinates.
    info_left = 0.0
    info_height = 0.3
    info_width = 1.0
    info_bottom = 1.
    info_top = info_bottom + info_height
    info_right = info_left + info_width
    info_center_x = 0.5 * (info_left + info_right)
    # Shrink the axes so the info box fits into the figure above them.
    plot_height = 1. - info_height

    fig = plt.figure()
    fig.subplots_adjust(top=plot_height)
    ax = fig.gca()

    # Plot the theta^2 distributions; the off histogram is scaled by alpha.
    histpoints(theta_on, bins=bin_edges, xerr='binwidth', label='On', fmt='none', ecolor='b', capsize=0)
    back_x, back_y, _ = histpoints(theta_off, bins=bin_edges, xerr='binwidth', label='Off', fmt='none', ecolor='r', capsize=0, scale=alpha, yerr='sqrt')

    # Fill the area underneath the background points.
    # back_y[1] is presumably the bin content (middle element of the y
    # tuple returned by histpoints) -- TODO confirm.
    ax.fill_between(back_x, back_y[1], 0, facecolor='grey', alpha=0.2, linewidth=0.0)

    # Mark the theta cut with a dashed vertical line.
    ax.axvline(x=theta_cut, linewidth=1, color='k', linestyle='dashed')

    # Draw the info box.
    p = patches.Rectangle((info_left, info_bottom), info_width, info_height, fill=True, transform=ax.transAxes, clip_on=False, facecolor='0.9', edgecolor='black')
    ax.add_patch(p)

    info_text = period + ',\n'
    info_text += 'Significance: {:.2f}, Alpha: {:.2f}\n'.format(significance, alpha)
    info_text += 'Confidence Cut: {:.2f}, Theta Sqare Cut: {:.2f} \n'.format(prediction_threshold, theta_cut)
    # NOTE(review): the "background events" figure is the raw n_off, while
    # the excess is computed with alpha * n_off -- confirm which is intended.
    info_text += '{:.2f} excess events, {:.2f} background events \n'.format(excess_events, n_off)

    ax.text(info_center_x, 0.5*(info_top+info_bottom)-0.05, info_text,
            horizontalalignment='center', verticalalignment='center', fontsize=10, transform=ax.transAxes)

    ax.text(info_center_x, info_top, title,
            bbox={'facecolor': 'white', 'pad': 10},
            horizontalalignment='center',
            verticalalignment='center',
            fontsize=14, color='black',
            transform=ax.transAxes)

    # NOTE(review): the label says mm^2, but the columns were converted by
    # theta_mm_to_theta_squared_deg above -- confirm the unit.
    plt.xlabel("Theta^2 / mm^2")
    plt.ylabel("Counts")

    plt.legend(fontsize=12)
    plt.savefig(out)
def main(outputfile, datatupels, ignorekeys, cuts, default_cuts):
    '''Plot Data MonteCarlo comparison plots from HDF5 files

    The cuts are combined with ``aggregatePlottingCuts`` and the data sets
    are loaded with ``loadData``.  For every common column that has an entry
    in the module-level ``default_plots`` one figure is drawn, overlaying all
    data sets, and saved both as ``<key>.png`` and as a page of the output
    PDF inside the directory returned by ``mkDirAtDestination``.

    outputfile   -- path of the PDF to write (its basename is reused inside
                    the picture directory)
    datatupels   -- data set description forwarded to ``loadData``
    ignorekeys   -- column names to leave out, or None
    cuts         -- plotting cuts, merged with ``default_cuts``
    default_cuts -- cuts applied by default
    '''

    cuts = aggregatePlottingCuts(cuts, default_cuts)

    df_list, datafiles, scales, labels, common_keys = loadData(datatupels, cuts)

    if ignorekeys is not None:
        common_keys = set(common_keys).difference(ignorekeys)
        for key in ignorekeys:
            logger.info("skipping column{}: on ignore list".format(key))

    picturePath = mkDirAtDestination(outputfile)

    with PdfPages(os.path.join(picturePath, os.path.basename(outputfile))) as pdf:
        logger.info("\nList of Keys:")
        for key in common_keys:
            logger.info(key)

            # Skip columns holding sequences; they cannot be histogrammed.
            if isinstance(df_list[0][key].iloc[0], (list, tuple)):
                logger.info("skipping column {}: cannot interprete content".format(key))
                continue

            # Only columns listed in default_plots are plotted.  The checks
            # run before a figure is created so that skipped columns no
            # longer leave an open, never-closed figure behind.
            # NOTE(review): the None fallback further down suggests unlisted
            # columns may have been meant to use default_plot_option
            # instead of being skipped -- confirm the intent.
            if key not in default_plots:
                logger.info("skipping column {}: no plot options defined".format(key))
                continue

            plot_option = default_plots[key]
            if plot_option is False:
                continue

            fig = plt.figure()
            plt.title(key)

            data_range = calcDataRange(df_list, key)

            gc.collect()
            logger.info(default_plot_option)

            xlabel = key
            func = None

            if plot_option is None:
                plot_option = default_plot_option
            else:
                # Work on a copy: removing "func"/"xUnit" from the shared
                # default_plots entry would break any later use of it.
                plot_option = dict(plot_option)
                func = plot_option.pop("func")
                xUnit = plot_option.pop("xUnit")
                xlabel += " / " + xUnit

                if func and func.__name__ and "lambda" not in func.__name__:
                    func_name = str(func.__name__)
                    logger.info("Function: {}({})".format(func_name, key))
                    xlabel = func_name + "({})".format(xlabel)

                plot_option = merge_dicts(default_plot_option, plot_option)

                # Turn an integer bin count into explicit bin edges over the
                # configured range, or over the data range if none is set.
                if "bins" in plot_option and "range" in plot_option:
                    bin_range = plot_option["range"]
                    if bin_range is None:
                        bin_range = data_range
                    try:
                        plot_option["bins"] = np.linspace(*bin_range, plot_option["bins"])
                    except (TypeError, ValueError):
                        # Keep the configured "bins" value and carry on
                        # instead of dropping into an interactive shell.
                        logger.exception("Could not compute bin edges for {}".format(key))

            ax = fig.gca()
            ax.grid(True)

            for df, scale, label, c in zip(df_list, scales, labels, color_cycle()):
                data = df[key]

                if func:
                    data = func(data)

                try:
                    x, y, norm = histpoints(data.values, xerr='binwidth', yerr="sqrt", label=label,
                                            fmt='none', capsize=0, normed=scale, ecolor=c["color"], **plot_option)
                    ax.fill_between(x, y[1], 0, alpha=0.2, linewidth=0.01, step='mid', facecolor=c["color"])
                    if plot_option.get("log"):
                        ax.set_yscale("log", nonposy='clip')
                    if "range" in plot_option:
                        ax.set_xlim(plot_option["range"])
                except Exception:
                    # One failing data set must not abort the remaining plots.
                    logger.exception("Plotting failed for {} in file {}".format(key, df["filename"]))

            plt.xlabel(xlabel)
            plt.ylabel("Frequency")
            plt.legend(loc='best')

            plt.savefig(os.path.join(picturePath, key+".png"))
            pdf.savefig()

            plt.close()

        # Set the file's metadata via the PdfPages object.  It does not
        # depend on the plotted key, so it is filled in once after the loop.
        now = datetime.datetime.today()
        d = pdf.infodict()
        d['Title'] = 'Data MC Comparison plots'
        d['Author'] = u'Jens Buss'
        d['Subject'] = 'Comparison'
        d['Keywords'] = 'Data:{}\nCuts:{}'.format(str(", ".join(datafiles)), str(cuts))
        d['CreationDate'] = now
        d['ModDate'] = now
Example #4
0
    range_x = (np.min(mc_norm[col]), np.max(DATASNorm[col]))
    range_y = (np.min(mc_norm[col]), np.max(mc_norm[col]))

    range_ = ()
    if np.max(mc_norm[col]) > np.max(mc_norm[col]):
        range_ = range_y
        print(range_x, range_)
    elif np.max(mc_norm[col]) > np.max(DATASNorm[col]):
        range_ = range_x
        print(range_y, range_)

    y_data, y_data, norm_data = histpoints(data_col,
                                           range=range_,
                                           color='black',
                                           normed=True,
                                           bins=100,
                                           label='sWeighted DATA',
                                           markersize=3.,
                                           marker='o',
                                           weights=DATASNorm['nSig_sw'])

    x_uncorr, y_uncorr, norm_uncorr = histpoints(mc_col,
                                                 range=range_,
                                                 color='red',
                                                 normed=True,
                                                 bins=100,
                                                 label='MC',
                                                 markersize=3.,
                                                 marker='o')

    x_corr, y_corr, norm_corr = histpoints(