コード例 #1
0
 def warn_good_import(added, deleted, form, modified):
     """Show a success box and the added rows when the import looks clean.

     Nothing is displayed unless both ``deleted`` and ``modified`` are empty.

     Args:
         added: Rows that are new in the latest download (needs ``len()``).
         deleted: Rows missing from the latest download (needs ``.empty``).
         form: Form name, used as the prefix of the box title.
         modified: Rows that changed between downloads (needs ``.empty``).
     """
     if not deleted.empty or not modified.empty:
         return

     message = '''There are %s rows of new data and no unexpected changes to old data.
                         Please proceed with <code>data["raw"]</code>.''' % len(added)
     h.showbox(message, form + ': Importing Data', 'success')
     h.showdataframe(added)
コード例 #2
0
    def read_data(self, form):
        """Load the old and new CSV export of ``form`` and compare them.

        The two files are read from ``<downloads_dir>/<olddate>/<form>.csv``
        and ``<downloads_dir>/<newdate>/<form>.csv``. The rows are compared
        on the ``'id'`` column through the ``h`` helpers, the added rows are
        flagged as complete, and the warning/success dialogues are shown.

        Args:
            form: Name of the form; also the CSV file stem.

        Returns:
            dict with the keys ``'raw'`` (the new export), ``'added'``,
            ``'deleted'``, ``'modified'`` and ``'merged'`` (the result of
            ``self.read_redcap(form, added)``).
        """

        def load_export(date_dir):
            # One CSV per form inside each dated download directory.
            csv_path = os.path.join(self.downloads_dir, date_dir,
                                    form + '.csv')
            return pd.read_csv(csv_path, low_memory=False)

        old = load_export(self.olddate)
        new = load_export(self.newdate)

        deleted = h.difference(old, new, 'id')
        modified = h.intersection_both(old, new, 'id', sources=['old', 'new'])

        # Work on a copy so the completion flags do not touch the helper's result.
        added = h.difference(new, old, 'id').copy()
        for flag_column in ('%s_complete' % form, 'common_complete'):
            added[flag_column] = 1
        merged = self.read_redcap(form, added)

        # display dialogues
        self.warn_deleted(deleted, form)
        self.warn_modified(form, modified)
        self.warn_good_import(added, deleted, form, modified)

        result = {
            'raw': new,
            'added': added,
            'deleted': deleted,
            'modified': modified,
            'merged': merged
        }
        return result
コード例 #3
0
 def warn_deleted(deleted, form):
     """Show a danger box and the affected rows if any old rows were removed.

     Args:
         deleted: Rows present in the old data but not in the new data
             (needs ``.empty`` and ``len()``).
         form: Form name, used as the prefix of the box title.
     """
     if deleted.empty:
         return

     message = '''There are %s rows in the old data that has been removed in the new data.
                         If this is expected, you can ignore this message.
                         To further inspect rows type <code>data["deleted"]</code>''' % len(deleted)
     h.showbox(message, form + ': Deleted', 'danger')
     h.showdataframe(deleted)
コード例 #4
0
 def warn_modified(form, modified):
     """Show a danger box and the affected rows if any old rows were modified.

     Args:
         form: Form name, used as the prefix of the box title.
         modified: Rows whose content differs between old and new data
             (needs ``.empty`` and ``len()``).
     """
     if modified.empty:
         return

     message = '''There are %s rows in the old data that has been modified in the new data.
                         If this is expected, you can ignore this message.
                         To further inspect rows type <code>data["modified"]</code>''' % len(modified)
     h.showbox(message, form + ': Modified', 'danger')
     h.showdataframe(modified)
コード例 #5
0
 def warn_duplicates(duplicates, form):
     """Report whether any patientid + patienttype combination is duplicated.

     Shows a success box when ``duplicates`` is empty; otherwise shows a
     danger box with the row count followed by the duplicated rows.

     Args:
         duplicates: Rows sharing a patientid + patienttype combination
             (needs ``.empty`` and ``len()``).
         form: Form name, used as the prefix of the box title.
     """
     if duplicates.empty:
         h.showbox('''All patientid + patienttype combos are unique.''',
                   form + ': No Duplicates', 'success')
         return

     message = '''There are %s rows that contain the same patientid + patienttype.''' % len(
         duplicates)
     h.showbox(message, form + ': Duplicates', 'danger')
     h.showdataframe(duplicates)
コード例 #6
0
 def warn_missing(missing, form):
     """Report Redcap subjects that are absent from the current data.

     Shows a success box when ``missing`` is empty; otherwise shows a danger
     box with the row count followed by the missing rows.

     Args:
         missing: Redcap subjects not found in the current data
             (needs ``.empty`` and ``len()``).
         form: Form name, used as the prefix of the box title.
     """
     if missing.empty:
         h.showbox('''All patientid's are in New Data.''',
                   form + ': No Missing Redcap Subjects', 'success')
         return

     message = '''There are %s Redcap subjects missing from the current data.''' % len(
         missing)
     h.showbox(message, form + ': Redcap Subjects Missing', 'danger')
     h.showdataframe(missing)
コード例 #7
0
 def warn_not_in_redcap(not_in_redcap, form):
     """Report rows whose patientid is not present in Redcap.

     Shows a success box when ``not_in_redcap`` is empty; otherwise shows a
     danger box with the row count followed by the offending rows.

     Args:
         not_in_redcap: Rows whose patientid is missing from Redcap
             (needs ``.empty`` and ``len()``).
         form: Form name, used as the prefix of the box title.
     """
     if not_in_redcap.empty:
         h.showbox('''All patientid's are in Redcap.''',
                   form + ': No Subject Missing from Redcap', 'success')
         return

     message = '''There are %s rows with patientid missing from Redcap.''' % len(
         not_in_redcap)
     h.showbox(message, form + ': Subjects Missing from Redcap', 'danger')
     h.showdataframe(not_in_redcap)
コード例 #8
0
def _scatter_deviation_row(ax, x, y, style):
    """Scatter the per-bin values of one deviation point onto *ax*.

    *style* is an ``(edge colour, marker, legend label)`` tuple.
    """
    color, marker, label = style
    ax.scatter(x,
               y,
               edgecolors=color,
               marker=marker,
               label=label,
               color='none',
               linewidths=2,
               s=10**2)


def _save_deviation_figure(fig, output_dir, output_formats, subdir,
                           file_stem):
    """Save *fig* to ``<output_dir>/<format>/<subdir>`` per format, then close it."""
    for fmt in output_formats:
        format_dir = "{}/{}/{}".format(output_dir, fmt, subdir)
        IOSH.create_dir(format_dir)
        fig.savefig("{}/{}.{}".format(format_dir, file_stem, fmt))
    plt.close(fig)


def _plot_deviation_with_projections(x, diff, x_edges, row_styles, title,
                                     coord_name, y_label, legend_title):
    """Build the scatter plot of *diff* with projections onto both axes.

    The main panel scatters every column of *diff* against *x*. The top panel
    histograms *x* weighted by the squared values, and the right panel
    histograms the values themselves. Returns the new figure.
    """
    # Axes rectangles in figure fractions: [left, bottom, width, height]
    left, width = 0.17, 0.49
    bottom, height = 0.12, 0.49
    spacing = 0.005
    rect_scatter = [left, bottom, width, height]
    rect_histx = [
        left, bottom + height + spacing, width,
        0.93 - (bottom + height + spacing)
    ]
    rect_histy = [
        left + width + spacing, bottom,
        0.93 - (left + width + spacing), height
    ]
    # In main-panel axes coordinates this is where the two projection panels
    # start, so the legend goes into the free corner between them.
    leg_pos = [(width + spacing) / width, (height + spacing) / height]

    hist_kwargs = {"histtype": 'step', "fill": False, "lw": 2}

    # start with a rectangular Figure
    fig = plt.figure(figsize=(12, 10))
    fig.suptitle(title)

    ax_scatter = plt.axes(rect_scatter)
    ax_scatter.tick_params(direction='in', top=True, right=True)
    ax_scatter.set_xlabel(coord_name)
    ax_scatter.set_ylabel(y_label)
    ax_histx = plt.axes(rect_histx)
    ax_histx.tick_params(direction='in', labelbottom=False)
    ax_histx.set_ylabel(r"$\sum_{bins} y^2$")
    ax_histy = plt.axes(rect_histy)
    ax_histy.tick_params(direction='in', labelleft=False)
    ax_histy.set_xlabel("#bins")

    # Common y binning for all rows so the right-hand histograms are comparable
    y_bin_edges = np.linspace(np.amin(diff), np.amax(diff), 20)

    for row, style in enumerate(row_styles):
        y = diff[:, row]
        color = style[0]
        _scatter_deviation_row(ax_scatter, x, y, style)
        ax_histx.hist(x,
                      bins=x_edges,
                      weights=y**2,
                      ec=color,
                      **hist_kwargs)
        ax_histy.hist(y,
                      bins=y_bin_edges,
                      orientation='horizontal',
                      ec=color,
                      **hist_kwargs)

    # Align the projection panels with the main panel
    ax_scatter.set_xlim((x_edges[0], x_edges[-1]))
    ax_histx.set_xlim(ax_scatter.get_xlim())
    ax_histy.set_ylim(ax_scatter.get_ylim())

    ax_scatter.legend(loc=leg_pos, title=legend_title, ncol=2)
    return fig


def _plot_deviation_scatter_only(x, diff, x_edges, row_styles, title,
                                 coord_name, y_label, legend_title):
    """Build the stand-alone scatter plot of *diff* and return the new figure."""
    fig = plt.figure(figsize=(9, 7), tight_layout=True)
    ax_scatter = plt.gca()
    ax_scatter.set_title(title)
    ax_scatter.set_xlabel(coord_name)
    ax_scatter.set_ylabel(y_label)

    for row, style in enumerate(row_styles):
        _scatter_deviation_row(ax_scatter, x, diff[:, row], style)

    ax_scatter.set_xlim((x_edges[0], x_edges[-1]))
    ax_scatter.legend(title=legend_title, ncol=3)
    return fig


def plot_deviation_test(file_path, output_formats=["pdf"]):
    """ Plot the (absolute) effect of the cos(theta) cut on the distributions.

    For every deviation direction in ``dev_directions`` and every coordinate
    of the distribution, four figures are written:

    * ``DevCutCut0``: (N_cut - N_cut0) / sqrt(N_cut0) with projections,
    * ``DevCutCut0_ScatterOnly``: the same values as a plain scatter plot,
    * ``DevParCut0``: (N_par - N_cut0) / sqrt(N_cut0) with projections,
    * ``DevParCut``: (N_par - N_cut) / sqrt(N_cut) with projections.

    All values are scaled by sqrt(TestLumi * cross section / N_total_MC).
    Files go to ``<dir of file_path>/plots/<format>/<subdir>/`` and are named
    ``<base>_<coordinate>_<tag>_<direction>.<format>``.

    Args:
        file_path: Path of the ``*_valdata.csv`` input file.
        output_formats: File extensions to save each figure in. The list
            default is only iterated, never mutated.
    """
    # Read the input file
    log.debug("Reading file: {}".format(file_path))
    reader = IOR.Reader(file_path)

    # Output info
    output_dir = "{}/plots".format(os.path.dirname(file_path))
    base_name = os.path.basename(file_path).replace("_valdata.csv", "")
    log.debug("Output will be written to: {}".format(output_dir))

    # Get the pandas dataframe for the cut histograms
    df = reader["Data"]
    scale_factor = VMCC.TestLumi * reader["CrossSection"] / reader["NTotalMC"]
    sqrt_scale = np.sqrt(scale_factor)

    bin_centers = reader["BinCenters"]
    n_bins = len(bin_centers)
    coord_names = reader["CoordName"]
    n_dims = len(coord_names)
    delta = reader["Delta"]

    # Reference: the row(s) without any deviation
    row_cut0 = df[(df["Delta-c"] == 0) & (df["Delta-w"] == 0)]
    N_cut_cut0 = np.array([row_cut0["C{}".format(b)] for b in range(n_bins)])

    # Find the bin edges for each dimension
    bin_edges = [
        np.linspace(reader["CoordMin"][d], reader["CoordMax"][d],
                    reader["CoordNBins"][d] + 1) for d in range(n_dims)
    ]

    # Find the deltas; the colour range is symmetric around zero
    deltas = IOPH.delta_pairs(df)
    delta_metrics = VMDH.delta_metric(deltas)
    max_metric = np.amax(delta_metrics)
    colors = PHC.ColorSpectrum("turbo", -1.1 * max_metric, 1.1 * max_metric)

    title = "{}, ${}$ab$^{{-1}}$".format(VTN.metadata_to_process(reader),
                                         VMCC.TestLumi / 1000)

    label_c0 = r"$\left(N_{cut}^{(\Delta c,\Delta w)} - N_{cut}^{0}\right)/\sqrt{N_{cut}^{0}}$"
    label_p0 = r"$\left(N_{par}^{(\Delta c,\Delta w)} - N_{cut}^{0}\right)/\sqrt{N_{cut}^{0}}$"
    label_pc = r"$\left(N_{par}^{(\Delta c,\Delta w)} - N_{cut}^{(\Delta c,\Delta w)}\right)/\sqrt{N_{cut}^{(\Delta c,\Delta w)}}$"

    for dev_dir in tqdm(dev_directions, desc="Dev. dir. loop", leave=False):
        log.debug("Looking at direction {}".format(dev_dir.name))
        dir_selection = dev_dir.func(deltas)
        dir_rows = df[dir_selection]
        dir_deltas = deltas[dir_selection]
        deltas_in_dir = delta_in_dir(dev_dir.name, dir_deltas)

        N_cut = np.array([dir_rows["C{}".format(b)] for b in range(n_bins)])
        N_par = np.array([dir_rows["P{}".format(b)] for b in range(n_bins)])

        diff_c0 = sqrt_scale * ratio(N_cut - N_cut_cut0, np.sqrt(N_cut_cut0))
        diff_p0 = sqrt_scale * ratio(N_par - N_cut_cut0, np.sqrt(N_cut_cut0))
        diff_pc = sqrt_scale * ratio(N_par - N_cut, np.sqrt(N_cut))

        # Backslashes are escaped explicitly: "\D" and "\d" are not valid
        # escape sequences and trigger a SyntaxWarning on recent Pythons.
        legend_title = "Shift {}\n$\\Delta {}$ $[\\delta={}]$".format(
            dev_dir.name, dev_dir.coord, delta)
        dev_dir_name = dev_dir.name.replace(" ", "_")

        # One (edge colour, marker, label) entry per deviation point. These
        # do not depend on the coordinate, so build them once per direction.
        # deltas_in_dir is indexed by position, hence range(len(...)).
        row_styles = [(colors[deltas_in_dir[row]], PHM.markers[row],
                       r"${}$".format(deltas_in_dir[row] / delta))
                      for row in range(len(dir_deltas))]

        # (output tag, values, y label, also draw the scatter-only variant)
        plots = (
            ("DevCutCut0", diff_c0, label_c0, True),
            ("DevParCut0", diff_p0, label_p0, False),
            ("DevParCut", diff_pc, label_pc, False),
        )

        for d in tqdm(range(n_dims), desc="Dim. loop", leave=False):
            x = bin_centers[:, d]
            x_edges = bin_edges[d]
            coord = coord_names[d]
            coord_name = "${}$".format(VTN.name_to_coord(coord))

            for tag, diff, y_label, with_scatter_only in plots:
                fig = _plot_deviation_with_projections(
                    x, diff, x_edges, row_styles, title, coord_name, y_label,
                    legend_title)
                _save_deviation_figure(
                    fig, output_dir, output_formats, tag,
                    "{}_{}_{}_{}".format(base_name, coord, tag, dev_dir_name))

                if with_scatter_only:
                    fig = _plot_deviation_scatter_only(
                        x, diff, x_edges, row_styles, title, coord_name,
                        y_label, legend_title)
                    _save_deviation_figure(
                        fig, output_dir, output_formats, tag,
                        "{}_{}_{}_ScatterOnly_{}".format(
                            base_name, coord, tag, dev_dir_name))
コード例 #9
0
def plot_chi_squared_test(file_path, output_formats=["pdf"]):
    """Plot the mismodelling chi-squared against the shift chi-squared.

    For every direction in ``dev_directions`` and every deviation point with
    ``delta_metric <= 2 * Delta``, two sums over the bins are computed from
    the scaled event counts:

    * x axis: sum of (N_cut - N_cut0)^2 / N_cut0,
    * y axis: sum of (N_par - N_cut)^2 / N_cut.

    Bins with a non-positive N_cut, and bins whose N_cut equals N_cut0 for
    all selected points of the direction, do not contribute. The points are
    drawn on log-log axes with the area above the diagonal shaded in red.
    The figure is written to
    ``<dir of file_path>/plots/<format>/ChiSquared/<base>_ChiSquared.<format>``.

    Args:
        file_path: Path of the ``*_valdata.csv`` input file.
        output_formats: File extensions to save the figure in. The list
            default is only iterated, never mutated.
    """
    # Read the input file
    log.debug("Reading file: {}".format(file_path))
    reader = IOR.Reader(file_path)

    # Output info
    output_dir = "{}/plots".format(os.path.dirname(file_path))
    base_name = os.path.basename(file_path).replace("_valdata.csv", "")
    log.debug("Output will be written to: {}".format(output_dir))

    # Get the pandas dataframe for the cut histograms
    df = reader["Data"]
    n_bins = len(reader["BinCenters"])

    row_cut0 = df[(df["Delta-c"] == 0) & (df["Delta-w"] == 0)]
    N_cut_cut0 = np.array(
        [row_cut0["C{}".format(b)].values[0] for b in range(n_bins)])

    # Get scale factor to normalise distribution to the (roughly) number of events
    # expected during the fit
    scale_factor = VMCC.TestLumi * reader["CrossSection"] / reader["NTotalMC"]
    # Multiply out of place: an in-place "*=" raises a casting error when the
    # counts were read as integers and the scale factor is a float.
    N_cut_cut0 = N_cut_cut0 * scale_factor

    # Find the deltas
    deltas = IOPH.delta_pairs(df)
    delta_metrics = VMDH.delta_metric(deltas)

    # Maximum sqrt(dc**2 + dw**2) that should be included in chi-squared calc.
    # -> Don't use outermost test values, not bad if not exact fit there
    d_max = 2.0 * reader["Delta"]
    d_max_selection = delta_metrics <= d_max

    # Chi squared arrays for each dev dir
    chi_sq_pc = []
    chi_sq_c0 = []

    for dev_dir in dev_directions:
        log.debug("Looking at direction {}".format(dev_dir.name))

        # Get the rows for this direction which fulfil the d_max criterion
        selection = np.logical_and(dev_dir.func(deltas), d_max_selection)
        dir_rows = df[selection]
        dir_deltas = deltas[selection]
        n_dev_points = len(dir_deltas)

        # Rows are bins, columns are the deviation points of this direction
        N_cut = np.array([dir_rows["C{}".format(b)]
                          for b in range(n_bins)]) * scale_factor
        N_par = np.array([dir_rows["P{}".format(b)]
                          for b in range(n_bins)]) * scale_factor

        diff_pc_sq = (N_par - N_cut)**2
        diff_c0_sq = ((N_cut.transpose() - N_cut_cut0)**2).transpose()

        # Bins that equal the undeviated value at every point of this
        # direction; this does not depend on the point, so evaluate it once.
        unaffected = np.all(N_cut == N_cut_cut0[:, np.newaxis], axis=1)

        dir_chi_sq_pc = []
        dir_chi_sq_c0 = []

        # Calculate the chi-squared for each deviation point
        for d in range(n_dev_points):
            dev_chi_sq_pc = 0
            dev_chi_sq_c0 = 0
            diff_pc_sq_d = diff_pc_sq[:, d]
            diff_c0_sq_d = diff_c0_sq[:, d]
            N_cut_d = N_cut[:, d]

            for b in range(n_bins):
                if not N_cut_d[b] > 0:
                    # Empty bin: cannot be normalised, only report a mismatch
                    if abs(diff_pc_sq_d[b]) > 0:
                        log.warning(
                            "Bin {} at deviation ({}) has 0 for cut and non-0 for parametrisation"
                            .format(b, dir_deltas[d]))
                elif unaffected[b]:
                    # Skip bins that aren't affected by the cut
                    continue
                else:
                    dev_chi_sq_pc += diff_pc_sq_d[b] / N_cut_d[b]
                    dev_chi_sq_c0 += diff_c0_sq_d[b] / N_cut_cut0[b]

            dir_chi_sq_pc.append(dev_chi_sq_pc)
            dir_chi_sq_c0.append(dev_chi_sq_c0)

        chi_sq_pc.append(dir_chi_sq_pc)
        chi_sq_c0.append(dir_chi_sq_c0)

    # --- Plotting ---------------------------------------------------------------

    # start with a rectangular Figure
    fig = plt.figure(figsize=(7.5, 6), tight_layout=True)

    ax_scatter = plt.gca()
    title = "{}, ${}$ab$^{{-1}}$".format(VTN.metadata_to_process(reader),
                                         VMCC.TestLumi / 1000)
    ax_scatter.set_title(title)
    ax_scatter.set_xlabel(r"$\chi_{shift}^{2}$", fontsize=26)
    ax_scatter.set_ylabel(r"$\chi_{mismodel}^{2}$", fontsize=26)

    # Set logarithmic axes, both with the same range.
    # NOTE(review): the lower edge is taken from chi_sq_pc only, while the
    # upper edge uses both arrays. Confirm this is intended; a minimum of 0
    # would break the logarithmic range.
    x_max = max(max(c) for c in chi_sq_c0)
    y_min = min(min(c) for c in chi_sq_pc)
    y_max = max(max(c) for c in chi_sq_pc)
    edge_min = 0.5 * y_min
    edge_max = 1.5 * max(x_max, y_max)
    n_edges = 16
    edges = np.logspace(np.log10(edge_min), np.log10(edge_max), n_edges)
    ax_scatter.set_yscale('log')
    ax_scatter.set_xscale('log')
    ax_scatter.set_ylim(edge_min, edge_max)
    ax_scatter.set_xlim(edge_min, edge_max)

    # Draw diagonal axis line, everything below that line is fine
    ax_scatter.fill_between(edges,
                            edges,
                            edge_max * np.ones(n_edges),
                            color='red',
                            alpha=0.5)
    ax_scatter.axline((edge_min, edge_min), (edge_max, edge_max),
                      ls='--',
                      color='black')

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']

    # One marker/colour per deviation direction
    for i_dir, dev_dir in enumerate(dev_directions):
        ax_scatter.scatter(chi_sq_c0[i_dir],
                           chi_sq_pc[i_dir],
                           color='none',
                           ec=colors[i_dir],
                           lw=2,
                           s=10**2,
                           marker=PHM.markers[i_dir],
                           label=dev_dir.name)

    legend_title = r"$\cos\theta_{{\mu}}^{{cut}}={}$,".format(
        reader["Coef|MuonAcc_CutValue"]
    ) + "\n" + r"$\sqrt{{\Delta c^2 + \Delta w^2}} \leq {}\delta$".format(
        d_max / reader["Delta"])
    ax_scatter.legend(title=legend_title, loc="upper left")

    # Save the figure in all requested formats
    for fmt in output_formats:
        format_dir = "{}/{}/ChiSquared".format(output_dir, fmt)
        IOSH.create_dir(format_dir)
        fig.savefig("{}/{}_ChiSquared.{}".format(format_dir, base_name, fmt))

    plt.close(fig)
コード例 #10
0
import pandas as pd
import PandasHelper as pdh
from PandasHelper import FileType
import re
import pandasql as pdsql

path = 'C:\\Users\\NickTsai\\Desktop\\PandaPractice\\pokemon_data.json'
path2 = 'C:\\Users\\NickTsai\\Desktop\\PandaPractice\\pokemon_data.xlsx'

# Load the JSON and the Excel file into DataFrames
df = pdh.read_file_as_datafram(file_path=path, file_type=FileType.Json)
dfs = pdh.read_file_as_datafram(file_path=path2, file_type=FileType.Excel)

# --- DataFrame filtering ---
is_grass = df['Type 1'] == 'Grass'
is_mega = df['Name'].str.contains('Mega')

# contains; the index is reset so the result is numbered from 0 again
df2 = df.loc[is_grass & is_mega].reset_index(drop=True)

# does not contain
df3 = df.loc[is_grass & ~is_mega]

# --- Sorting ---
# single column, descending
df4 = df.sort_values(by='Name', ascending=False)
# multiple columns, ascending (the default)
df5 = df.sort_values(by=['Name', 'Type 1'])

# --- Regular expressions (case-insensitive) ---
matches_type = df['Type 1'].str.contains('fire|grass',
                                         flags=re.IGNORECASE,
                                         regex=True)
df6 = df[matches_type]

starts_with_pi = df['Name'].str.contains('^pi[a-z]*',
                                         flags=re.IGNORECASE,
                                         regex=True)
df7 = df[starts_with_pi]