def warn_good_import(added, deleted, form, modified):
    """Report a clean import when no old rows were deleted or modified.

    Nothing happens unless both ``deleted`` and ``modified`` are empty. In
    that case a 'success' box stating ``len(added)`` is shown through
    ``h.showbox`` and ``added`` is handed to ``h.showdataframe``.

    ``form`` is used as the prefix of the box title.
    """
    if not deleted.empty or not modified.empty:
        # Old data changed, so this is not a clean import.
        return

    message = (
        'There are %s rows of new data and no unexpected changes to old data. '
        'Please proceed with <code>data["raw"]</code>.'
    ) % len(added)
    title = form + ': Importing Data'
    h.showbox(message, title, 'success')
    h.showdataframe(added)
def read_data(self, form):
    """Load the old and new CSV exports of ``form`` and compare them.

    The files ``<form>.csv`` are read from ``self.downloads_dir`` under the
    ``self.olddate`` and ``self.newdate`` sub-directories. The two frames are
    compared on the ``'id'`` column through the ``h`` helpers, the added rows
    are flagged as complete and passed to ``self.read_redcap``, and the
    warning dialogs are shown.

    Returns a dict with the keys ``'raw'`` (the new frame), ``'added'``,
    ``'deleted'``, ``'modified'`` and ``'merged'`` (the result of
    ``self.read_redcap``).
    """
    csv_name = form + '.csv'
    old_path = os.path.join(self.downloads_dir, self.olddate, csv_name)
    old = pd.read_csv(old_path, low_memory=False)
    new_path = os.path.join(self.downloads_dir, self.newdate, csv_name)
    new = pd.read_csv(new_path, low_memory=False)

    # presumably: rows only in old / rows in both / rows only in new,
    # matched on 'id' -- confirm against the ``h`` helper module.
    deleted = h.difference(old, new, 'id')
    modified = h.intersection_both(old, new, 'id', sources=['old', 'new'])
    added = h.difference(new, old, 'id').copy()

    # Mark every added row as complete before it is handed to read_redcap.
    for complete_column in ('%s_complete' % form, 'common_complete'):
        added[complete_column] = 1

    merged = self.read_redcap(form, added)

    # Display dialogues.
    self.warn_deleted(deleted, form)
    self.warn_modified(form, modified)
    self.warn_good_import(added, deleted, form, modified)

    return dict(
        raw=new,
        added=added,
        deleted=deleted,
        modified=modified,
        merged=merged,
    )
def warn_deleted(deleted, form):
    """Show a 'danger' box when rows of the old data are gone from the new data.

    Does nothing when ``deleted`` is empty. Otherwise the number of rows in
    ``deleted`` is reported through ``h.showbox`` (title prefixed with
    ``form``) and the frame is passed to ``h.showdataframe``.
    """
    if deleted.empty:
        return

    template = (
        'There are %s rows in the old data that has been removed in the new '
        'data. If this is expected, you can ignore this message. To further '
        'inspect rows type <code>data["deleted"]</code>'
    )
    h.showbox(template % len(deleted), form + ': Deleted', 'danger')
    h.showdataframe(deleted)
def warn_modified(form, modified):
    """Show a 'danger' box when rows of the old data differ in the new data.

    Does nothing when ``modified`` is empty. Otherwise the number of rows in
    ``modified`` is reported through ``h.showbox`` (title prefixed with
    ``form``) and the frame is passed to ``h.showdataframe``.
    """
    if modified.empty:
        return

    template = (
        'There are %s rows in the old data that has been modified in the new '
        'data. If this is expected, you can ignore this message. To further '
        'inspect rows type <code>data["modified"]</code>'
    )
    h.showbox(template % len(modified), form + ': Modified', 'danger')
    h.showdataframe(modified)
def warn_duplicates(duplicates, form):
    """Report whether any patientid + patienttype combination is duplicated.

    An empty ``duplicates`` frame yields a 'success' box. A non-empty one
    yields a 'danger' box stating ``len(duplicates)`` and the frame is passed
    to ``h.showdataframe``. ``form`` prefixes the box title.
    """
    found = not duplicates.empty
    if found:
        message = ('There are %s rows that contain the same patientid + '
                   'patienttype.') % len(duplicates)
        title = form + ': Duplicates'
        level = 'danger'
    else:
        message = 'All patientid + patienttype combos are unique.'
        title = form + ': No Duplicates'
        level = 'success'

    h.showbox(message, title, level)
    if found:
        h.showdataframe(duplicates)
def warn_missing(missing, form):
    """Report Redcap subjects that are absent from the current data.

    An empty ``missing`` frame yields a 'success' box. A non-empty one yields
    a 'danger' box stating ``len(missing)`` and the frame is passed to
    ``h.showdataframe``. ``form`` prefixes the box title.
    """
    if missing.empty:
        h.showbox("All patientid's are in New Data.",
                  form + ': No Missing Redcap Subjects',
                  'success')
        return

    count = len(missing)
    h.showbox(
        'There are %s Redcap subjects missing from the current data.' % count,
        form + ': Redcap Subjects Missing',
        'danger')
    h.showdataframe(missing)
def warn_not_in_redcap(not_in_redcap, form):
    """Report rows whose patientid is not present in Redcap.

    An empty ``not_in_redcap`` frame yields a 'success' box. A non-empty one
    yields a 'danger' box stating ``len(not_in_redcap)`` and the frame is
    passed to ``h.showdataframe``. ``form`` prefixes the box title.
    """
    all_known = not_in_redcap.empty
    if all_known:
        box = ("All patientid's are in Redcap.",
               form + ': No Subject Missing from Redcap',
               'success')
    else:
        box = ('There are %s rows with patientid missing from Redcap.'
               % len(not_in_redcap),
               form + ': Subjects Missing from Redcap',
               'danger')

    h.showbox(*box)
    if not all_known:
        h.showdataframe(not_in_redcap)
# Artist styling shared by every deviation plot.
_DEVIATION_SCATTER_STYLE = {"color": 'none', "linewidths": 2, "s": 10**2}
_DEVIATION_HIST_STYLE = {"histtype": 'step', "fill": False, "lw": 2}


def _scatter_deviation_points(ax, x, y, row, deltas_in_dir, colors, delta):
    """Scatter the points of deviation ``row`` on ``ax`` and return their color."""
    color = colors[deltas_in_dir[row]]
    ax.scatter(x, y,
               edgecolors=color,
               marker=PHM.markers[row],
               label=r"${}$".format(deltas_in_dir[row] / delta),
               **_DEVIATION_SCATTER_STYLE)
    return color


def _plot_deviation_with_projections(x, diff, x_edges, n_rows, deltas_in_dir,
                                     colors, delta, title, coord_name,
                                     y_label, legend_title):
    """Build the scatter plot of ``diff`` with its two marginal histograms.

    ``diff`` is indexed as ``diff[:, row]`` with one column per deviation.
    The top histogram shows the per-bin sum of ``y**2`` along ``x``, the right
    one the distribution of the ``y`` values. Returns the new figure.
    """
    # Definitions for the axes (figure-relative coordinates).
    left, width = 0.17, 0.49
    bottom, height = 0.12, 0.49
    spacing = 0.005
    rect_scatter = [left, bottom, width, height]
    rect_histx = [
        left, bottom + height + spacing, width,
        0.93 - (bottom + height + spacing)
    ]
    rect_histy = [
        left + width + spacing, bottom,
        0.93 - (left + width + spacing), height
    ]
    # Legend anchor in scatter-axes coordinates: just outside the top-right
    # corner, i.e. in the free area between the two histogram panels.
    leg_pos = [(width + spacing) / width, (height + spacing) / height]

    # Start with a rectangular figure.
    fig = plt.figure(figsize=(12, 10))
    fig.suptitle(title)

    ax_scatter = plt.axes(rect_scatter)
    ax_scatter.tick_params(direction='in', top=True, right=True)
    ax_scatter.set_xlabel(coord_name)
    ax_scatter.set_ylabel(y_label)

    ax_histx = plt.axes(rect_histx)
    ax_histx.tick_params(direction='in', labelbottom=False)
    ax_histx.set_ylabel(r"$\sum_{bins} y^2$")

    ax_histy = plt.axes(rect_histy)
    ax_histy.tick_params(direction='in', labelleft=False)
    ax_histy.set_xlabel("#bins")

    y_bin_edges = np.linspace(np.amin(diff), np.amax(diff), 20)

    for row in range(n_rows):
        y = diff[:, row]
        color = _scatter_deviation_points(ax_scatter, x, y, row,
                                          deltas_in_dir, colors, delta)
        ax_histx.hist(x, bins=x_edges, weights=y**2, ec=color,
                      **_DEVIATION_HIST_STYLE)
        ax_histy.hist(y, bins=y_bin_edges, orientation='horizontal',
                      ec=color, **_DEVIATION_HIST_STYLE)

    ax_scatter.set_xlim((x_edges[0], x_edges[-1]))
    # Keep the marginal histograms aligned with the scatter axes.
    ax_histx.set_xlim(ax_scatter.get_xlim())
    ax_histy.set_ylim(ax_scatter.get_ylim())
    ax_scatter.legend(loc=leg_pos, title=legend_title, ncol=2)
    return fig


def _plot_deviation_scatter_only(x, diff, x_edges, n_rows, deltas_in_dir,
                                 colors, delta, title, coord_name, y_label,
                                 legend_title):
    """Build the stand-alone scatter plot of ``diff`` and return the figure."""
    fig = plt.figure(figsize=(9, 7), tight_layout=True)
    ax_scatter = plt.gca()
    ax_scatter.set_title(title)
    ax_scatter.set_xlabel(coord_name)
    ax_scatter.set_ylabel(y_label)

    for row in range(n_rows):
        _scatter_deviation_points(ax_scatter, x, diff[:, row], row,
                                  deltas_in_dir, colors, delta)

    ax_scatter.set_xlim((x_edges[0], x_edges[-1]))
    ax_scatter.legend(title=legend_title, ncol=3)
    return fig


def _save_deviation_figure(fig, output_dir, output_formats, subdir, file_stem):
    """Save ``fig`` once per requested format, then close it.

    Files are written to ``<output_dir>/<format>/<subdir>/<file_stem>.<format>``.
    """
    for fmt in output_formats:
        format_dir = "{}/{}/{}".format(output_dir, fmt, subdir)
        IOSH.create_dir(format_dir)
        fig.savefig("{}/{}.{}".format(format_dir, file_stem, fmt))
    plt.close(fig)


def plot_deviation_test(file_path, output_formats=("pdf",)):
    """ Plot the (absolute) effect of the cos(theta) cut on the distributions.

    For every direction in ``dev_directions`` and every coordinate dimension
    of the input, four figures are written below ``<dir of file_path>/plots``:

    * ``DevCutCut0``: (N_cut - N_cut0) / sqrt(N_cut0), with marginal histograms
    * ``DevCutCut0`` (``ScatterOnly`` file): the same quantity, scatter only
    * ``DevParCut0``: (N_par - N_cut0) / sqrt(N_cut0), with marginal histograms
    * ``DevParCut``: (N_par - N_cut) / sqrt(N_cut), with marginal histograms

    All differences are multiplied by ``sqrt(scale_factor)`` where
    ``scale_factor = VMCC.TestLumi * CrossSection / NTotalMC``.

    Args:
        file_path: Path of the input file handed to ``IOR.Reader``. A trailing
            ``_valdata.csv`` is stripped to form the output base name.
        output_formats: Iterable of file extensions; each figure is saved once
            per entry via ``Figure.savefig``.
    """
    # Read the input file
    log.debug("Reading file: {}".format(file_path))
    reader = IOR.Reader(file_path)

    # Output info
    output_dir = "{}/plots".format(os.path.dirname(file_path))
    base_name = os.path.basename(file_path).replace("_valdata.csv", "")
    log.debug("Output will be written to: {}".format(output_dir))

    # Get the pandas dataframe for the cut histograms
    df = reader["Data"]
    sqrt_scale = np.sqrt(
        VMCC.TestLumi * reader["CrossSection"] / reader["NTotalMC"])
    bin_centers = reader["BinCenters"]
    coord_names = reader["CoordName"]
    delta = reader["Delta"]
    n_bins = len(bin_centers)
    n_dims = len(coord_names)

    # Reference histogram: the row(s) without any deviation.
    row_cut0 = df[(df["Delta-c"] == 0) & (df["Delta-w"] == 0)]
    N_cut_cut0 = np.array([row_cut0["C{}".format(b)] for b in range(n_bins)])

    # Find the bin edges for each dimension
    bin_edges = [
        np.linspace(reader["CoordMin"][d], reader["CoordMax"][d],
                    reader["CoordNBins"][d] + 1) for d in range(n_dims)
    ]

    # Find the deltas and a color range symmetric around zero deviation
    deltas = IOPH.delta_pairs(df)
    delta_metrics = VMDH.delta_metric(deltas)
    max_metric = np.amax(delta_metrics)
    colors = PHC.ColorSpectrum("turbo", -1.1 * max_metric, 1.1 * max_metric)

    label_c0 = (r"$\left(N_{cut}^{(\Delta c,\Delta w)} - N_{cut}^{0}\right)"
                r"/\sqrt{N_{cut}^{0}}$")
    label_p0 = (r"$\left(N_{par}^{(\Delta c,\Delta w)} - N_{cut}^{0}\right)"
                r"/\sqrt{N_{cut}^{0}}$")
    label_pc = (r"$\left(N_{par}^{(\Delta c,\Delta w)} - "
                r"N_{cut}^{(\Delta c,\Delta w)}\right)"
                r"/\sqrt{N_{cut}^{(\Delta c,\Delta w)}}$")

    for dev_dir in tqdm(dev_directions, desc="Dev. dir. loop", leave=False):
        log.debug("Looking at direction {}".format(dev_dir.name))
        dir_selection = dev_dir.func(deltas)
        dir_rows = df[dir_selection]
        dir_deltas = deltas[dir_selection]
        deltas_in_dir = delta_in_dir(dev_dir.name, dir_deltas)

        N_cut = np.array([dir_rows["C{}".format(b)] for b in range(n_bins)])
        N_par = np.array([dir_rows["P{}".format(b)] for b in range(n_bins)])

        sqrt_N_cut_cut0 = np.sqrt(N_cut_cut0)
        diff_c0 = sqrt_scale * ratio(N_cut - N_cut_cut0, sqrt_N_cut_cut0)
        diff_p0 = sqrt_scale * ratio(N_par - N_cut_cut0, sqrt_N_cut_cut0)
        diff_pc = sqrt_scale * ratio(N_par - N_cut, np.sqrt(N_cut))

        title = "{}, ${}$ab$^{{-1}}$".format(VTN.metadata_to_process(reader),
                                              VMCC.TestLumi / 1000)
        # The LaTeX part is a raw string so that \Delta and \delta are not
        # treated as (invalid) escape sequences; only "\n" is a real escape.
        legend_title = ("Shift {}\n"
                        r"$\Delta {}$ $[\delta={}]$").format(
                            dev_dir.name, dev_dir.coord, delta)
        dev_dir_name = dev_dir.name.replace(" ", "_")

        # (figure builder, data, y label, output sub-directory, file tag),
        # in the order in which the figures are produced.
        plots = (
            (_plot_deviation_with_projections, diff_c0, label_c0,
             "DevCutCut0", "DevCutCut0"),
            (_plot_deviation_scatter_only, diff_c0, label_c0,
             "DevCutCut0", "DevCutCut0_ScatterOnly"),
            (_plot_deviation_with_projections, diff_p0, label_p0,
             "DevParCut0", "DevParCut0"),
            (_plot_deviation_with_projections, diff_pc, label_pc,
             "DevParCut", "DevParCut"),
        )

        for d in tqdm(range(n_dims), desc="Dim. loop", leave=False):
            coord = coord_names[d]
            panel = {
                "x": bin_centers[:, d],
                "x_edges": bin_edges[d],
                "n_rows": len(dir_deltas),
                "deltas_in_dir": deltas_in_dir,
                "colors": colors,
                "delta": delta,
                "title": title,
                "coord_name": "${}$".format(VTN.name_to_coord(coord)),
                "legend_title": legend_title,
            }

            for make_figure, diff, y_label, subdir, tag in plots:
                fig = make_figure(diff=diff, y_label=y_label, **panel)
                file_stem = "{}_{}_{}_{}".format(base_name, coord, tag,
                                                 dev_dir_name)
                _save_deviation_figure(fig, output_dir, output_formats,
                                       subdir, file_stem)
def _direction_chi_squared(N_cut, N_par, N_cut_cut0, dir_deltas):
    """Return the per-deviation chi-squared sums for one deviation direction.

    ``N_cut`` and ``N_par`` are indexed as ``[bin, deviation]``;
    ``N_cut_cut0`` holds one reference value per bin. Two lists with one entry
    per element of ``dir_deltas`` are returned:

    * sum over bins of ``(N_par - N_cut)**2 / N_cut``
    * sum over bins of ``(N_cut - N_cut_cut0)**2 / N_cut_cut0``
    """
    n_bins = len(N_cut_cut0)
    diff_pc_sq = (N_par - N_cut)**2
    diff_c0_sq = ((N_cut.transpose() - N_cut_cut0)**2).transpose()

    # Bins whose cut histogram equals the reference for every deviation of
    # this direction; this does not depend on the deviation index.
    unaffected = [np.all(N_cut[b] == N_cut_cut0[b]) for b in range(n_bins)]

    dir_chi_sq_pc = []
    dir_chi_sq_c0 = []
    for d in range(len(dir_deltas)):
        dev_chi_sq_pc = 0
        dev_chi_sq_c0 = 0
        diff_pc_sq_d = diff_pc_sq[:, d]
        diff_c0_sq_d = diff_c0_sq[:, d]
        N_cut_d = N_cut[:, d]

        for b in range(n_bins):
            if not N_cut_d[b] > 0:
                # Empty bin: it cannot enter the sum (division by zero), but
                # a non-zero parametrisation there is worth reporting.
                if abs(diff_pc_sq_d[b]) > 0:
                    log.warning(
                        "Bin {} at deviation ({}) has 0 for cut and non-0 for parametrisation"
                        .format(b, dir_deltas[d]))
                continue
            if unaffected[b]:
                # Skip bins that aren't affected by the cut
                continue
            dev_chi_sq_pc += diff_pc_sq_d[b] / N_cut_d[b]
            dev_chi_sq_c0 += diff_c0_sq_d[b] / N_cut_cut0[b]

        dir_chi_sq_pc.append(dev_chi_sq_pc)
        dir_chi_sq_c0.append(dev_chi_sq_c0)

    return dir_chi_sq_pc, dir_chi_sq_c0


def plot_chi_squared_test(file_path, output_formats=("pdf",)):
    """Plot chi-squared of the mismodelling against chi-squared of the shift.

    For every direction in ``dev_directions`` and every deviation point with
    ``delta_metric <= 2 * Delta`` two sums over bins are computed from the
    histograms scaled by ``VMCC.TestLumi * CrossSection / NTotalMC``:

    * x axis: ``(N_cut - N_cut0)**2 / N_cut0``
    * y axis: ``(N_par - N_cut)**2 / N_cut``

    They are drawn as a log-log scatter plot with the region above the
    diagonal shaded red, and saved as
    ``<dir of file_path>/plots/<format>/ChiSquared/<base>_ChiSquared.<format>``.

    Args:
        file_path: Path of the input file handed to ``IOR.Reader``. A trailing
            ``_valdata.csv`` is stripped to form the output base name.
        output_formats: Iterable of file extensions; the figure is saved once
            per entry via ``Figure.savefig``.
    """
    # Read the input file
    log.debug("Reading file: {}".format(file_path))
    reader = IOR.Reader(file_path)

    # Output info
    output_dir = "{}/plots".format(os.path.dirname(file_path))
    base_name = os.path.basename(file_path).replace("_valdata.csv", "")
    log.debug("Output will be written to: {}".format(output_dir))

    # Get the pandas dataframe for the cut histograms
    df = reader["Data"]
    n_bins = len(reader["BinCenters"])

    # Scale factor to normalise distributions to the (roughly) number of
    # events expected during the fit
    scale_factor = VMCC.TestLumi * reader["CrossSection"] / reader["NTotalMC"]

    row_cut0 = df[(df["Delta-c"] == 0) & (df["Delta-w"] == 0)]
    # Out-of-place multiplication: an in-place ``*=`` with a float factor
    # raises a casting error when the columns hold integers.
    N_cut_cut0 = np.array(
        [row_cut0["C{}".format(b)].values[0]
         for b in range(n_bins)]) * scale_factor

    # Find the deltas
    deltas = IOPH.delta_pairs(df)
    delta_metrics = VMDH.delta_metric(deltas)

    # Maximum sqrt(dc**2 + dw**2) that should be included in chi-squared calc.
    # -> Don't use outermost test values, not bad if not exact fit there
    d_max = 2.0 * reader["Delta"]
    d_max_selection = delta_metrics <= d_max

    # Chi squared arrays for each dev dir
    chi_sq_pc = []
    chi_sq_c0 = []
    for dev_dir in dev_directions:
        log.debug("Looking at direction {}".format(dev_dir.name))

        # Get the rows for this direction which fulfill the d_max criterion
        selection = np.logical_and(dev_dir.func(deltas), d_max_selection)
        dir_rows = df[selection]
        dir_deltas = deltas[selection]

        N_cut = np.array([dir_rows["C{}".format(b)]
                          for b in range(n_bins)]) * scale_factor
        N_par = np.array([dir_rows["P{}".format(b)]
                          for b in range(n_bins)]) * scale_factor

        dir_chi_sq_pc, dir_chi_sq_c0 = _direction_chi_squared(
            N_cut, N_par, N_cut_cut0, dir_deltas)
        chi_sq_pc.append(dir_chi_sq_pc)
        chi_sq_c0.append(dir_chi_sq_c0)

    # --- Plotting ---------------------------------------------------------------
    fig = plt.figure(figsize=(7.5, 6), tight_layout=True)
    ax_scatter = plt.gca()

    title = "{}, ${}$ab$^{{-1}}$".format(VTN.metadata_to_process(reader),
                                          VMCC.TestLumi / 1000)
    ax_scatter.set_title(title)
    ax_scatter.set_xlabel(r"$\chi_{shift}^{2}$", fontsize=26)
    ax_scatter.set_ylabel(r"$\chi_{mismodel}^{2}$", fontsize=26)

    # Set logarithmic axes
    x_max = max(max(c) for c in chi_sq_c0)
    y_min = min(min(c) for c in chi_sq_pc)
    y_max = max(max(c) for c in chi_sq_pc)
    # NOTE(review): the lower edge is derived from the y values only, as in
    # the previous implementation -- presumably because the x values can be
    # zero, which a log axis cannot show; confirm.
    edge_min = 0.5 * y_min
    edge_max = 1.5 * max(x_max, y_max)
    n_edges = 16
    edges = np.logspace(np.log10(edge_min), np.log10(edge_max), n_edges)

    ax_scatter.set_yscale('log')
    ax_scatter.set_xscale('log')
    ax_scatter.set_ylim(edge_min, edge_max)
    ax_scatter.set_xlim(edge_min, edge_max)

    # Draw diagonal axis line, everything below that line is fine
    ax_scatter.fill_between(edges,
                            edges,
                            edge_max * np.ones(n_edges),
                            color='red',
                            alpha=0.5)
    ax_scatter.axline((edge_min, edge_min), (edge_max, edge_max),
                      ls='--',
                      color='black')

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    for i_dir, dev_dir in enumerate(dev_directions):
        ax_scatter.scatter(chi_sq_c0[i_dir],
                           chi_sq_pc[i_dir],
                           color='none',
                           ec=colors[i_dir],
                           lw=2,
                           s=10**2,
                           marker=PHM.markers[i_dir],
                           label=dev_dir.name)

    legend_title = r"$\cos\theta_{{\mu}}^{{cut}}={}$,".format(
        reader["Coef|MuonAcc_CutValue"]
    ) + "\n" + r"$\sqrt{{\Delta c^2 + \Delta w^2}} \leq {}\delta$".format(
        d_max / reader["Delta"])
    ax_scatter.legend(title=legend_title, loc="upper left")

    # Save the figure in all requested formats
    for fmt in output_formats:
        format_dir = "{}/{}/ChiSquared".format(output_dir, fmt)
        IOSH.create_dir(format_dir)
        fig.savefig("{}/{}_ChiSquared.{}".format(format_dir, base_name, fmt))

    plt.close(fig)
import re

import pandas as pd
import pandasql as pdsql

import PandasHelper as pdh
from PandasHelper import FileType

# Input files (same data set in two formats).
path = r'C:\Users\NickTsai\Desktop\PandaPractice\pokemon_data.json'
path2 = r'C:\Users\NickTsai\Desktop\PandaPractice\pokemon_data.xlsx'

df = pdh.read_file_as_datafram(file_path=path, file_type=FileType.Json)
dfs = pdh.read_file_as_datafram(file_path=path2, file_type=FileType.Excel)

# --- DataFrame filtering ---------------------------------------------------
grass_mask = df['Type 1'] == 'Grass'
mega_mask = df['Name'].str.contains('Mega')

# Grass type whose name contains "Mega"
df2 = df.loc[grass_mask & mega_mask]
# Grass type whose name does not contain "Mega"
df3 = df.loc[grass_mask & ~mega_mask]

# --- Index reset -----------------------------------------------------------
# Renumber the filtered rows from 0 and discard the old index.
df2 = df2.reset_index(drop=True)

# --- Sorting ---------------------------------------------------------------
# Single key, descending
df4 = df.sort_values(by=['Name'], ascending=False)
# Multiple keys, ascending
df5 = df.sort_values(by=['Name', 'Type 1'], ascending=True)

# --- Regular expressions ---------------------------------------------------
# Case-insensitive match of either type name
fire_or_grass_mask = df['Type 1'].str.contains(
    'fire|grass', flags=re.IGNORECASE, regex=True)
df6 = df.loc[fire_or_grass_mask]

# Case-insensitive match of names starting with "pi"
starts_with_pi_mask = df['Name'].str.contains(
    '^pi[a-z]*', flags=re.IGNORECASE, regex=True)
df7 = df.loc[starts_with_pi_mask]