def image_processor(input_folder, image_name, output_path, save_file=False):
    """Split the whole-cell pixels of one image into cytoplasm and nucleus.

    Two pixel tables are loaded through ``pixel_cleaner``:
    ``<input_folder><image_name>_cells_pixels.csv`` (whole-cell ROIs) and
    ``<input_folder><image_name>_nuclei_pixels.csv`` (nuclei). Their
    ``'cells'`` / ``'nuclei'`` columns are renamed to ``'Intensity'``, and the
    whole-cell rows are partitioned on the ``'X,Y'`` coordinate column.

    Args:
        input_folder: prefix prepended to the csv file names (plain string
            concatenation, so it needs its own trailing separator).
        image_name: image identifier used to build the file names; it is also
            passed through to ``pixel_cleaner``.
        output_path: prefix for the optional excel output (concatenated the
            same way as ``input_folder``).
        save_file: when true, write the cytoplasm, nucleus and whole-cell
            tables to ``<output_path><image_name>_pixel_filtered.xlsx``.

    Returns:
        dict with keys ``'cyto'`` (whole-cell rows whose coordinate is not in
        the nuclei table), ``'nuclei_pixels'`` (the nuclei table as loaded),
        ``'nucleus'`` (whole-cell rows excluded from ``'cyto'``) and
        ``'whole_cell'`` (all whole-cell rows).
    """
    # NOTE(review): pixel_cleaner is defined elsewhere; it is assumed to return
    # a DataFrame carrying an 'X,Y' column - confirm against its definition.
    coord_col = 'X,Y'

    # Whole-cell (ROI) pixel info
    logger.info(f'Processing ROI pixels from {image_name}')
    ROI_pixels = pixel_cleaner(
        input_folder + image_name + '_cells_pixels.csv', image_name)
    ROI_pixels.rename(columns={'cells': 'Intensity'}, inplace=True)
    logger.debug(ROI_pixels.columns.tolist())

    # Same again for the nuclear pixel info
    logger.info(f'Processing nuclear pixels from {image_name}')
    nuclei_pixels = pixel_cleaner(
        input_folder + image_name + '_nuclei_pixels.csv', image_name)
    nuclei_pixels.rename(columns={'nuclei': 'Intensity'}, inplace=True)
    logger.debug(nuclei_pixels.columns.tolist())

    # Cytoplasm = whole-cell pixels whose coordinate is absent from the nuclei
    logger.info(f'Filtering nuclear pixels from ROI pixels for {image_name}')
    inside_nucleus = ROI_pixels[coord_col].isin(nuclei_pixels[coord_col])
    cyto_pixels = ROI_pixels[~inside_nucleus]
    logger.debug(cyto_pixels.columns.tolist())

    # Nucleus (per cell) = whole-cell pixels that did not make it into the
    # cytoplasm set
    kept_as_cyto = ROI_pixels[coord_col].isin(cyto_pixels[coord_col])
    nuclear_pixels = ROI_pixels[~kept_as_cyto]

    if save_file:
        logger.info(f'Saving individual excel files for {image_name}')
        sheet_names = ['cytoplasm', 'nucleus', 'whole_cell']
        sheets = [cyto_pixels, nuclear_pixels, ROI_pixels]
        FileHandling.df_to_excel(
            output_path + f'{image_name}_pixel_filtered.xlsx',
            sheet_names,
            data_frames=sheets)

    logger.info(
        f'Pixels processed successfully for {image_name}! Results saved to {output_path}'
    )

    processed = {
        'cyto': cyto_pixels,
        'nuclei_pixels': nuclei_pixels,
        'nucleus': nuclear_pixels,
        'whole_cell': ROI_pixels,
    }
    return processed
# Add columns normalised to the t0 ("before") intensity. Dividing the
# "before" column by itself is deliberate: it gives the baseline column that
# sits alongside the normalised "after" values.
for timepoint in ('before', 'after'):
    before_after[f'norm_Intensity_{timepoint}'] = (
        before_after[f'Intensity_{timepoint}']
        / before_after['Intensity_before'])

# Collect useful columns, one sheet per mutant, and save to excel
summary_columns = [
    'track_id', 'ROI_name_before', 'Intensity_before', 'Intensity_after',
    'norm_Intensity_before', 'norm_Intensity_after'
]
mutant_summary = {}
for group, df in before_after.groupby('mutant'):
    mutant_summary[group] = df[summary_columns]

FileHandling.df_to_excel(
    f'{output_path}first_last_summary.xlsx',
    sheetnames=list(mutant_summary),
    data_frames=list(mutant_summary.values()))

# Long-format table of the normalised intensities for seaborn plotting
normalised_wide = before_after[[
    'mutant', 'track_id', 'norm_Intensity_before', 'norm_Intensity_after'
]]
plottable = normalised_wide.melt(
    id_vars=['mutant', 'track_id'],
    var_name='Activation',
    value_name='Intensity')
# Keep only the trailing "before" / "after" token of the melted column name
plottable['Activation'] = plottable['Activation'].str.split('_').str[-1]

#fig = scatbar_plot.scatbar_plot(x_col='mutant', y_col='Intensity', plotting_dfs=[plottable], hue_col='Activation', group_col='mutant')
#plt.savefig(f'{output_path}norm_first_last.png')

# melt to produce data for seaborn plotting raw intensity
fluo_data = raw_data.copy()[['phase', 'fluorescence']] fluo_data = fluo_data.groupby('phase').median().T fluo_data['sample'] = sample_name sample_data.append(fluo_data) summary_df = pd.concat(sample_data).reset_index(drop=True) summary_df['plate'] = summary_df['sample'].str[0] summary_df['well'] = summary_df['sample'].str[1:] summary_df['sample'] = summary_df['well'].map(sample_map) summary_df.sort_values(['sample'], inplace=True) FileHandling.df_to_excel( data_frames=[summary_df], sheetnames=['fluorescence_per_phase'], output_path=f'{output_folder}per_phase_median_TPE.xlsx') # Generate equivalent dataset, ignoring phase sample_data = {} for filename in file_list: sample_name = os.path.splitext(filename)[0] raw_data = pd.read_csv(f'{input_folder}{filename}') raw_data.rename(columns={fluorescence_col: "fluorescence"}, inplace=True) fluo_data = raw_data.copy()['fluorescence'] sample_data[sample_name] = fluo_data.median() summary_df = pd.DataFrame.from_dict(sample_data, orient='index').reset_index() summary_df.rename(columns={
}) cleaned_codes['length'] = cleaned_codes['code'].astype(str).apply(len) level_map = {2: 1, 4: 2, 5: 3, 6: 3} cleaned_codes['level'] = cleaned_codes['length'].map(level_map) cleaned_codes['code'] = cleaned_codes['code'].astype(str) cleaned_codes['code'] = [ '0' + code if len(code) == 5 else code for code in cleaned_codes['code'] ] cleaned_codes['parent_field_code'] = cleaned_codes['code'].str[0:2] cleaned_codes['parent_group_code'] = [ code if len(code) == 4 else np.nan for code in cleaned_codes['code'].str[0:4] ] FileHandling.df_to_excel(data_frames=[cleaned_codes.drop('length', axis=1)], sheetnames=['FOR_summary'], output_path=f'{output_folder}for_summary.xlsx') cleaned_codes.tail(50) cleaned_codes.to_csv(f'{output_folder}for_summary.csv') # split on ' ' - any 6-digit codes will not be affected, whereas 4-digit codes will have two elements # for all codes, keep element[0] and for four digit codes return map into 'description' column # Generate mapped column for 2-digit and 4-digit codes (by splitting number, then collecting first two or first four digits?) # Generate reverse dictionary mapping description to each set of codes (two, four or six-digit) # Import summary data for field of research
def scattbar_plotter(summary_df, group_xcol, y_col, hue_col, output_path, order=None, hue_order=None):
    """Draw a translucent bar plot overlaid with a swarm plot and custom error bars.

    Bars come from ``sns.barplot`` with its built-in error bars disabled.
    Error bars are then drawn manually at each bar centre, using the
    per-(group, hue) mean and standard deviation of ``y_col``.

    Args:
        summary_df: DataFrame holding the ``group_xcol``, ``y_col`` and
            ``hue_col`` columns. It is not modified.
        group_xcol: column providing the x-axis categories.
        y_col: numeric column plotted on the y axis.
        hue_col: column providing the hue sub-groups; also the legend title.
        output_path: path prefix passed to ``FileHandling.fig_to_pdf`` (with
            ``'_scattbar'`` appended) and to ``FileHandling.fig_to_svg``.
        order: optional ordering of the x categories. When omitted or empty,
            the order of first appearance in ``summary_df`` is used for the
            error bars (seaborn receives ``None``).
        hue_order: optional ordering of the hue levels.

    Returns:
        The matplotlib figure containing the plot.
    """
    fig, ax = plt.subplots()
    br = sns.barplot(x=group_xcol, y=y_col, data=summary_df, hue=hue_col, dodge=True, errwidth=1.25, alpha=0.25, ci=None, order=order, hue_order=hue_order, ax=ax)
    sns.swarmplot(x=group_xcol, y=y_col, data=summary_df, hue=hue_col, dodge=True, order=order, hue_order=hue_order, ax=ax)
    leg_label = hue_col

    # To generate custom error bars
    # Explicit None/length checks: truth-testing an array-like `order` raises.
    if order is None or len(order) == 0:
        order = list(summary_df[group_xcol].unique())
    else:
        # A tuple would be read as a single MultiIndex key when indexing below
        order = list(order)
    logger.debug(f'Samples: {order}')

    stats_df = summary_df
    if hue_order is not None and len(hue_order) > 0:
        # Map hue levels to their requested position so that groupby (which
        # sorts its keys) returns them in hue_order. assign() builds a new
        # frame, leaving the caller's DataFrame untouched.
        hue_map = {level: position for position, level in enumerate(hue_order)}
        stats_df = summary_df.assign(hue_pos=summary_df[hue_col].map(hue_map))
        hue_col = 'hue_pos'

    # nunique ignores NaN (e.g. hue levels absent from hue_order)
    number_groups = stats_df[hue_col].nunique()
    logger.debug(f'Number of groups: {number_groups}')

    bars = br.patches
    xvals = [(bar.get_x() + bar.get_width() / 2) for bar in bars]
    xvals.sort()

    # Aggregate y_col only, so unrelated non-numeric columns cannot break
    # mean()/std(); the transpose round-trip reorders the x categories to
    # follow `order`.
    grouped = stats_df.groupby([group_xcol, hue_col])[[y_col]]
    yvals = grouped.mean().T[order].T[y_col]
    yerr = grouped.std().T[order].T[y_col]

    (_, caps, _) = ax.errorbar(x=xvals, y=yvals, yerr=yerr, capsize=4, elinewidth=1.25, ecolor="black", linewidth=0)
    for cap in caps:
        cap.set_markeredgewidth(2)
    ax.set_ylabel(y_col)

    # To only label once in legend
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[0:number_groups], labels[0:number_groups], bbox_to_anchor=(1.26, 1.05), title=leg_label)

    # rotate tick labels
    for label in ax.get_xticklabels():
        label.set_rotation(45)
    ax.set_xlabel(group_xcol)

    plt.tight_layout()
    plt.autoscale()
    #plt.show()

    FileHandling.fig_to_pdf([fig], output_path + '_scattbar')
    FileHandling.fig_to_svg(['scattbar'], [fig], output_path)

    return fig
metrics['fwci_awarded'] = metrics['fwci_awarded'].apply(value_checker)
metrics['pubs_awarded'] = metrics['pubs_awarded'].apply(value_checker)

# Collect datapoints per year
pubs_list = {}
fwci_list = {}
# Group on the bare column name: a one-element list makes recent pandas yield
# 1-tuple keys, which would turn the labels below into "(2015,)_1".
for year, df in metrics.groupby('Year'):
    for level, data in df.groupby('type_cat'):
        pubs_list[f'{year}_{level}'] = list(data['pubs_awarded'])
        fwci_list[f'{year}_{level}'] = list(data['fwci_awarded'])

# Generate separate dataframes; wrapping each list in a Series lets columns of
# unequal length be aligned (short ones are padded with NaN)
pubs = pd.DataFrame({k: pd.Series(v) for k, v in pubs_list.items()})
fwci = pd.DataFrame({k: pd.Series(v) for k, v in fwci_list.items()})

FileHandling.df_to_excel(data_frames=[pubs, fwci],
                         sheetnames=['pubs', 'fwci'],
                         output_path=f'{output_folder}metrics_per_year.xlsx')

# Collect cols for each level
levels = ['_1', '_2', '_3']
for level in levels:
    cols = [col for col in pubs.columns.tolist() if level in col]
    test_df = pubs[cols].melt(value_name='publications', var_name='Group')
    # One-way ANOVA: 'Group' (the year/level column label) is the only factor
    aov = pg.anova(data=test_df,
                   dv='publications',
                   between='Group',
                   export_filename=f'{output_folder}anova_pubs{level}.csv')
    pg.print_table(aov)
    # FDR-corrected post hocs with Hedges'g effect size
'object_number', 'mutant', 'image_number' ] compiled_spots = compiled_raw_data[cols_of_interest] compiled_spots.columns = new_col_names # Save to csv compiled_spots.to_csv(f'{output_folder}compiled_spots.csv') # Calculate interesting bits ## mean of area and mean_intensity spot_calculations = compiled_spots.groupby( ['mutant', 'image_number']).mean()[['area', 'mean_intensity']].reset_index() ## count per cell spot_count = compiled_spots.groupby(['mutant', 'image_number' ]).count()['object_number'].reset_index() spot_min = compiled_spots.groupby(['mutant', 'image_number' ]).min()['min_intensity'].reset_index() # concat into final summary df spot_summary = reduce( lambda df1, df2: pd.merge( df1, df2, on=['mutant', 'image_number'], how='outer'), [spot_calculations, spot_count, spot_min]) spot_summary.rename(columns={'object_number': 'count'}, inplace=True) FileHandling.df_to_excel(f'{output_folder}spot_summary.xlsx', sheetnames=['spot_summary'], data_frames=[spot_summary])
# Build one compiled table across all images, giving every ROI a descriptive
# cell_id of the form <mutant>_<position>_<row number>
summary_list = []
for key, summary_df in cyto_summary.items():
    summary_df = summary_df.dropna()
    # Image names are '_'-delimited: field 0 is the mutant, field 2 the position
    name_fields = summary_df['image_name_mCherry'].str.split('_')
    summary_df['mutant'] = name_fields.str[0]
    summary_df['position'] = name_fields.str[2]
    # Running row number within this image, stored as text so it can be joined
    summary_df['number'] = list(map(str, range(summary_df.shape[0])))
    summary_df['cell_id'] = [
        '_'.join(id_parts)
        for id_parts in zip(summary_df['mutant'], summary_df['position'],
                            summary_df['number'])
    ]
    summary_dict[key]['cyto'] = summary_df
    summary_list.append(summary_df)

mean_cell_summary = pd.concat(summary_list)
FileHandling.df_to_excel(output_path + 'mean_cell_summary.xlsx',
                         sheetnames=['summary'],
                         data_frames=[mean_cell_summary.reset_index()])

# # Checking out the summary results
# pixel_dict.keys() # shows the image names that were processed i.e. key 1
# pixel_dict['barnaseWT+Q25_pos_001_t_0']['cyto'].keys() # Shows the available dataframes for each image i.e. key 2
# pixel_dict['Ex2_pos_1_t_0']['nuclei_pixels'] # shows the cytoplasm pixel summary for the first image
# summary_dict.keys()
# summary_dict['Ex2_pos_1_t_0'].keys()
# # summary_dict['barnaseWT+Q25_pos_001_t_0']['cyto']

# plot individual image for t0 to test segmentation with ROI name overlayed
for image_name in [name for name in image_names if 't_0' in name]:
    pixel_cyto = pixel_dict[image_name]['cyto']
filename for filename in os.listdir(input_folder) if 'Scholarly_Output' in filename ] # Read in raw scival info, clean pubs_raw = [] for filename in file_list: pubs_raw.append(pd.read_excel(f'{input_folder}{filename}', skiprows=12)) pubs_raw = pd.concat(pubs_raw) pubs_clean = pubs_raw.rename(columns={'Unnamed: 0': 'name'}) pubs_clean = pubs_clean.drop( [col for col in pubs_clean if 'Unnamed' in str(col)], axis=1) pubs_clean.set_index('name', inplace=True) pubs_clean.head(20) pubs_clean.loc['Jenkins, Misty R.'] # count number of pubs 10 years before award pubs = {} # test_dict = {k: v for k, v in random.sample(search_dict.items(), 10)} for name, year in search_dict.items(): past_range = np.arange(year - 10, year + 1, 1) pubs[name] = pubs_clean.loc[name, past_range].sum() awardees['pubs_awarded'] = awardees['match_name'].map(pubs) # Save completed mapping to excel FileHandling.df_to_excel( data_frames=[awardees], sheetnames=['fwci_pubs'], output_path=f'{output_folder}ten_year_metrics_summary.xlsx')
# Per-image GFP intensity statistics for the nucleus and cytoplasm pixel sets,
# restricted to pixels below the mCherry threshold
types = ['nucleus', 'cyto']
# Statistics are stored as rows while filling (one column per image/type) and
# transposed afterwards
cell_summary = pd.DataFrame(index=[
    'image_name', 'pixel_type', 'mean', 'median', 'std', 'mode', 'geomean'
])
for image, dictionary_1 in filtered_pixels.items():
    for pixel_type, df in dictionary_1.items():
        if pixel_type in types:
            logger.info(f'Processing {pixel_type} for {image}.')
            logger.info(f'Original shape: {df.shape}')
            df = df[df['mCherry_intensity'] < mCherry_threshold]
            logger.info(f'New shape: {df.shape}')
            # calculate relevant info; mode() can return several values, the
            # first one is kept
            stats = [
                image, pixel_type, df['GFP_intensity'].mean(),
                df['GFP_intensity'].median(), df['GFP_intensity'].std(),
                df['GFP_intensity'].mode()[0],
                geomean(list(df['GFP_intensity']))
            ]
            cell_summary[f'{image}_{pixel_type}'] = stats

# Clean up stats df: one row per image/pixel-type combination
cell_summary = cell_summary.T.reset_index(drop=True)

# Add column for the mutant, replicate (first two '_'-delimited fields of the
# image name). The fields are indexed explicitly because unpacking the `.str`
# accessor is not supported by pandas >= 1.0.
name_fields = cell_summary['image_name'].str.split('_')
cell_summary['mutant'] = name_fields.str[0]
cell_summary['replicate'] = name_fields.str[1]

# Save summary to excel
FileHandling.df_to_excel(output_path + 'GFP_stats_summary.xlsx',
                         sheetnames=['GFP_summary'],
                         data_frames=[cell_summary])
proportion = round(sample_dict['proportions'][phase], 2) plt.annotate(f'{phase}:{proportion}', (0.2, phase_pos[x]), xycoords='figure fraction') plt.legend() plt.title(sample_name) plt.ylabel('Count') plt.savefig(f'{output_folder}plots/{sample_name}_predict.png') data_dict[sample_name] = sample_dict # Save cluster-labelled df to csvs for sample in data_dict.keys(): data = data_dict[sample]['normalised'] data.to_csv(f"{output_folder}normalised/{sample}.csv") # Collect proportions into simple df labelled with sample name summary = pd.DataFrame(index=data_dict.keys(), columns=['G', 'S', 'M']) for sample in data_dict.keys(): proportions = data_dict[sample]['proportions'] summary.loc[sample, 'G'] = proportions['G'] summary.loc[sample, 'S'] = proportions['S'] summary.loc[sample, 'M'] = proportions['M'] summary.rename_axis('sample', inplace=True) summary.reset_index(inplace=True) FileHandling.df_to_excel(data_frames=[summary], sheetnames=['phase_proportion'], output_path=f'{output_folder}summary.xlsx')
test_data['Year'] = year test_data.drop(grant_cols, axis=1, inplace=True) test_data.rename(columns={name_col: 'CIA_name'}, inplace=True) name_summary.append(test_data) name_summary = pd.concat(name_summary).reset_index(drop=True) name_summary['name'] = name_summary['CIA_name'].str.split( ' ').str[1:].str.join(' ') type_mapper = {'ECF': 1, 'CDF': 2, 'RF': 3, 'L': 3, 'EL2': 2, 'EL1': 1} name_summary['type_cat'] = name_summary['Type'].map(type_mapper) # Save all to excel data_frames = [ awarded_summary, gender_summary, title_summary, title_proportions, research_summary, research_proportions, state_summary, institute_summary, kw_summary, name_summary ] sheetnames = [ 'total_rates', 'per_gender', 'CIA_title', 'title_proportion_per_year', 'field_of_research', 'broad_research_proportions', 'state_summary', 'institute_summary', 'key_word_summary', 'name_summary' ] FileHandling.df_to_excel(data_frames=data_frames, sheetnames=sheetnames, output_path=f'{output_folder}summary_stats.xlsx')
day_cleaning['Plate #'] = day_cleaning['Well #'].str.split('_', expand=True)[0] day_cleaning['Treatment #'] = day_cleaning['Well #'].str.split( '_', expand=True)[2] day_cleaning['Plate_coords'] = day_cleaning['Plate #'].map( str) + '_' + day_cleaning['Treatment #'].map(str) day_cleaning['day'] = x + 1 day_cleaning['sample'] = day_cleaning['Plate_coords'].map(sample_map) day_dict[day_name] = day_cleaning summary_df = pd.concat(day_dict.values()) FileHandling.df_to_excel(data_frames=[summary_df], sheetnames=['summary'], output_path=f'{output_path}summary.xlsx') # Generate data for plotting plotting_plates = [] for group, data in summary_df.groupby('Plate #'): plot_df = data[['Well #', 'sample', 'day', '%Area']].reset_index() # Exclude unused wells to allow easier plotting plot_df = plot_df[plot_df['sample'] != 'Unused'] plot_df['Hue position'] = plot_df['day'].astype("category").cat.codes plotting_plates.append(plot_df) # Generate single plot with each plate dimensions = len(set(summary_df['Plate #'])) fig, axes = plt.subplots(dimensions, 1, figsize=(8, dimensions * 3))