def multifile_hist(input_folder, output_path, sample_name):
    logger.info('Input Folder detected: {}'.format(input_folder))
    # find all files that contain 'Compiled' in the name
    compiled_files = glob.glob(input_folder + '/*Compiled.xlsx')
    # if there are no Compiled files, generate them
    if not compiled_files:
        FileHandling.PD_compiler(input_folder)
        # redefine compiled_files to include the newly generated files
        compiled_files = glob.glob(input_folder + '/*Compiled.xlsx')
    # iterate through all compiled_files, creating figures for each
    figs = []
    figures = {}
    for file in compiled_files:
        logger.info(file)
        fig_dict = PlotUtils.pep_abund_hist(file)
        figs.append(fig_dict)
    # collect all figures into a single dict and save to pdf
    for fig_dict in figs:
        figures.update(fig_dict)
    FileHandling.fig_to_pdf(figures, output_path, fig_type=sample_name + 'PepAbundHist')
    logger.info(f"Figures saved to {output_path}")
    return figures
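# Illustrative sketch only (not the actual PlotUtils.pep_abund_hist): assuming each Compiled
# workbook holds peptide rows with abundance columns matching 'Abundance Ratio: (', this shows
# how a {figure_name: Figure} dict, as consumed by multifile_hist above, could be built.
# The sheet layout, column pattern, and function name here are hypothetical.
def example_pep_abund_hist(file_path):
    import os
    import pandas as pd
    import matplotlib.pyplot as plt

    data = pd.read_excel(file_path)  # assumes the first sheet holds the peptide table
    ratio_cols = [col for col in data.columns if 'Abundance Ratio: (' in col]
    fig_dict = {}
    for col in ratio_cols:
        fig, ax = plt.subplots()
        data[col].dropna().plot.hist(bins=50, ax=ax)
        ax.set_xlabel(col)
        ax.set_ylabel('Peptide count')
        fig_dict[f"{os.path.basename(file_path)}_{col}"] = fig
    return fig_dict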
def main(input_path, output_path, sample_name):
    logger.info('Input path: {}'.format(input_path))
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
    if os.path.isdir(input_path):
        figures = multifile_hist(input_path, output_path, sample_name)
    else:
        figures = PlotUtils.pep_abund_hist(input_path)
        FileHandling.fig_to_pdf(figures, output_path, fig_type=sample_name + 'PepAbundHist')
        logger.info(f"Figures saved to {output_path}")
    return figures
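# Minimal usage sketch with hypothetical paths (placeholders, not paths from the project):
# point main at either a folder of Compiled workbooks or a single file.
def example_usage():
    figures = main(
        input_path='experiment_data/',      # folder of *Compiled.xlsx files, or a single file path
        output_path='experiment_results/',  # created if it does not already exist
        sample_name='ExampleSample_',       # prefixed to the saved PepAbundHist pdf
    )
    logger.info(f"Generated {len(figures)} histogram figures")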
def main(input, output_path, sample_name, x_vals=None, test_elements=None, group_col=None, element_col=None, svg=None):
    logger.info(f"Analysing: {sample_name}")
    if not os.path.isdir(output_path):
        os.mkdir(output_path)
        logger.info(f"Output directory made at {output_path}")
    if isinstance(input, pd.DataFrame):
        logger.info('DataFrame input detected')
        raw_data = input
    elif os.path.isfile(input):
        logger.info(f'Input file being loaded from {input}')
        raw_data = pd.read_excel(input)
    else:
        logger.error("Incorrect input format detected. Please pass a full file path, or a dataframe, as input.")
        raise ValueError("Input must be a DataFrame or a path to an existing file.")
    # Collect the index column for plot names by default
    if not element_col:
        element_col = raw_data.index.tolist()
        logger.info('No element column detected. Using index instead.')
    # If a grouping column is passed (e.g. proteins), then all elements for that group are plotted together.
    # Otherwise, individual elements are plotted separately and are expected to be the index.
    if group_col:
        col_for_dict = group_col
    else:
        col_for_dict = element_col
    per_element_dict = {}
    for element in raw_data[col_for_dict].unique():
        per_element_dict[element] = raw_data[raw_data[col_for_dict] == element]
    # If test elements are passed to the function, only produce plots for those elements; otherwise fit the whole dataframe
    if test_elements:
        data_for_fitting = {k: per_element_dict[k] for k in test_elements}
    else:
        data_for_fitting = per_element_dict
    # Create the figure for each group or element
    figure_dict = {}
    for key, value in data_for_fitting.items():
        fig = per_protein_fitter(value, x_vals, group_col, element_col)
        fig.suptitle(key)
        figure_dict[key] = fig
        plt.show()
    FileHandling.fig_to_pdf(figure_dict, output_path=output_path + sample_name, fig_type='_Sigmoids')
    if svg:
        FileHandling.fig_to_svg(fig_names=list(figure_dict.keys()), fig_list=list(figure_dict.values()), output_path=output_path + sample_name)
    logger.info(f'Figures saved to {output_path}')
    return figure_dict
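# per_protein_fitter is defined elsewhere in this package; the function below is an assumed,
# simplified stand-in showing the general approach: fit a four-parameter sigmoid to each
# element's ratios across x_vals (e.g. denaturant concentrations) and overlay the fitted
# curve on the data points. The y_cols parameter and the curve form are illustrative only.
def example_sigmoid_fitter(element_df, x_vals, y_cols):
    import numpy as np
    import matplotlib.pyplot as plt
    from scipy.optimize import curve_fit

    def sigmoid(x, bottom, top, x50, slope):
        # standard four-parameter logistic curve
        return bottom + (top - bottom) / (1 + np.exp((x50 - x) / slope))

    x = np.asarray(x_vals, dtype=float)
    fig, ax = plt.subplots()
    for _, row in element_df.iterrows():
        y = row[y_cols].astype(float).values
        ax.plot(x, y, 'o', alpha=0.5)
        try:
            popt, _ = curve_fit(sigmoid, x, y, p0=[y.min(), y.max(), x.mean(), 1.0], maxfev=5000)
            x_fine = np.linspace(x.min(), x.max(), 200)
            ax.plot(x_fine, sigmoid(x_fine, *popt), '-')
        except RuntimeError:
            # the fit failed to converge for this element; keep the raw points only
            continue
    ax.set_xlabel('Denaturant concentration')
    ax.set_ylabel('Ratio')
    return fig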
def main(input_path, output_path, sample_name): """ Master function to apply a list of functions to the input file, generating urea denaturation curve for each protein Parameters: input_path: string input path for the file to be processed output_path: string output path for which any output generated by functions will be saved sample_name: string sample name associated with the file to be processed. Returns: summary_table: DataFrame dataframe containing the summarised output of the functions applied in order """ #av_summary = do_funcs(input_path, output_path, sample_name) logger.info('Input Path: {}'.format(input_path)) logger.info(f'Preparing to process {sample_name}....') total_data = FileHandling.file_reader(input_path) quant_data, col_list = DataWrangling.quantified_data(total_data) two_unique_cys, cys_pep, non_cys_pep = DataWrangling.Unique_Cys_sorter( quant_data) #set index of summary dataframes to the protein accession cys_pep = cys_pep.set_index(["Master Protein Accessions"], drop=False) non_cys_pep = non_cys_pep.set_index(["Master Protein Accessions"], drop=False) non_cys_Av = CalcUtils.non_cys_AR(cys_pep, non_cys_pep) summary_table = CalcUtils.cys_div_noncys(cys_pep, non_cys_Av, col_list) #Saving all dataframes so far to excel results document data_frames = [ total_data, quant_data, two_unique_cys, cys_pep, non_cys_pep, summary_table ] sheetnames = [ 'Total Data', 'Quant Data', 'TwoUniqueCYS', 'CysPep', 'NonCysPep', 'Summary Table' ] FileHandling.df_to_excel(output_path, sheetnames, data_frames) #collect only columns of interest summary_table.reset_index(drop=True, inplace=True) ratio_col = [col for col in summary_table.columns if '_Cys/NonCys' in col] select_col = ['Master Protein Accessions', 'Annotated Sequence' ] + ratio_col summary_data = summary_table[select_col] logger.debug(summary_data) #rename columns to simple names summary_data = summary_data.rename(columns={ 'Master Protein Accessions': 'ProteinID', 'Annotated Sequence': 'Sequence' }) #for peptides seen more than once in a sample, take average ratio to give only unique ratios for each peptide av_summary = CalcUtils.single_element_av(summary_data, 'Sequence') ##Filtering for proteins which have too many missing values, and generating plots. logger.info('Filtering for missing values...') #removing rows with >thresh Nans filtered_consensus = DataWrangling.filter_NaNs(av_summary, filter_type='total', threshold=0) #preparing variables and plotting scatter for each protein logger.info('Creating scatter plots...') urea_conc = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6] fig_dict = PlotUtils.multirow_scatter(filtered_consensus, key='ProteinID', col_head='Sequence', x_vals=urea_conc, x_label='Urea Conc', y_label='Cys_NonCys') #to save all figures to pdf FileHandling.fig_to_pdf(fig_dict, output_path + 'Thresholded_') logger.info('Save figs to pdf complete') #to show all figures as output, for protein, figure in fig_dict.items(): plt.show(figure) Threshold_0 = filtered_consensus dfs = [Threshold_0] sheetnames = ['Total_0'] FileHandling.df_to_excel(output_path=output_path + 'Thresholded_', sheetnames=sheetnames, data_frames=dfs) return summary_data
def main(input_path, output_path, sample_name, do_plots=True):
    """
    Master function to apply a list of functions to the input file.

    Parameters:
        input_path: string
            input path for the file to be processed
        output_path: string
            output path to which any output generated by the functions will be saved
        sample_name: string
            sample name associated with the file to be processed

    Returns:
        summary_data: DataFrame
            dataframe containing the summarised output of the functions applied in order
    """
    logger.info(f'Preparing to process: {sample_name}')
    logger.info(f"Input Path: {input_path}")
    total_data = FileHandling.file_reader(input_path)
    quant_data, col_list = DataWrangling.quantified_data(total_data)
    # Raj only considers peptides that are observed in all replicates
    quant_data = quant_data.dropna(axis=0, how='any', thresh=None, subset=col_list)
    two_unique_cys, cys_pep, non_cys_pep = DataWrangling.Unique_Cys_sorter(quant_data)
    # set index of summary dataframes to the protein accession
    cys_pep = cys_pep.set_index(["Master Protein Accessions"], drop=False)
    non_cys_pep = non_cys_pep.set_index(["Master Protein Accessions"], drop=False)
    non_cys_Av = CalcUtils.non_cys_AR(cys_pep, non_cys_pep)
    summary_table = CalcUtils.cys_div_noncys(cys_pep, non_cys_Av, col_list)
    # collect the list of columns for each type of ratio
    summary_table.reset_index(drop=True, inplace=True)
    abundance_cols = [col for col in summary_table.columns if 'Abundance Ratio: (' in col]
    ratio_col = [col for col in abundance_cols if '_Cys/NonCys' in col]
    logger.info(f"Collecting ratio columns: {ratio_col}")
    NC_col = [col for col in abundance_cols if '_NC' in col]
    logger.info(f"Collecting NonCys columns: {NC_col}")
    C_col = [col for col in abundance_cols if ')_' not in col]
    logger.info(f"Collecting Cys columns: {C_col}")
    # collect only columns of interest for the summary table
    select_col = ['Master Protein Accessions', 'Annotated Sequence'] + abundance_cols
    summary_data = summary_table[select_col].copy()
    summary_data.dropna(axis=0, how='any', thresh=None, subset=abundance_cols, inplace=True)
    summary_data = summary_data.reset_index(drop=True)
    logger.debug(f"Summary data acquired: {summary_data}")
    # rename columns to simple names
    summary_data = summary_data.rename(columns={
        'Master Protein Accessions': 'ProteinID',
        'Annotated Sequence': 'Sequence'
    })
    ##### Paired t-test
    logger.info("Calculating t-test statistics")
    summary_data = CalcUtils.t_test_pair(summary_data, C_col, NC_col)
    ##### Include -log10 of the p-value
    summary_data['-Log10 p-Value'] = -np.log10(summary_data['p-value'])
    ##### Include log2 of the average ratios
    summary_data = CalcUtils.row_mean(summary_data, ratio_col, 'Av. Ratio')
    summary_data['Log2 Average Ratio'] = np.log2(summary_data['Av. Ratio'])
    ##### Include an average column for NonCys
    summary_data = CalcUtils.row_mean(summary_data, NC_col, 'NC Average')
    summary_data['Log2 Average NC'] = np.log2(summary_data['NC Average'])
    ##### Assign a colour column
    summary_data = DataWrangling.colour_column(summary_data, '-Log10 p-Value', 'p-value colour')
    logger.info("Completed calculations with summary data table")
    logger.debug(f"Summary data table post calculations: {summary_data}")
    # save all dataframes so far to the excel results document
    data_frames = [total_data, quant_data, two_unique_cys, cys_pep, non_cys_pep, summary_table, summary_data]
    sheetnames = ['Total Data', 'Quant Data', 'TwoUniqueCYS', 'CysPep', 'NonCysPep', 'Summary Info', 'Summary Data']
    FileHandling.df_to_excel(output_path=output_path + sample_name + '_Foldedness_', sheetnames=sheetnames, data_frames=data_frames)
    logger.info(f"All dataframes saved to {output_path}")
    if do_plots:
        logger.info(f"Preparing foldedness scatterplot for {sample_name}")
        figures = {}
        figures['Foldedness Scatter'] = foldedness_scatter(summary_data, sample_name)
        #### Save figs to pdf
        FileHandling.fig_to_pdf(figures, output_path=output_path, fig_type=sample_name + '_Foldedness')
        logger.info(f"Figures saved to {output_path}")
        for key, value in figures.items():
            plt.show()
    return summary_data
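# CalcUtils.t_test_pair is defined elsewhere; this is an assumed, minimal sketch of the paired
# t-test step above: for each peptide (row), compare the Cys columns against the matching
# NonCys columns with scipy.stats.ttest_rel and store the p-value, which then feeds the
# -Log10 p-Value and Log2 Average Ratio columns. Assumes cys_cols and noncys_cols list the
# replicates in matching order.
def example_t_test_pair(df, cys_cols, noncys_cols):
    from scipy import stats

    p_values = []
    for _, row in df.iterrows():
        cys_vals = row[cys_cols].astype(float).values
        noncys_vals = row[noncys_cols].astype(float).values
        # paired (related-samples) t-test across replicates for this peptide
        _, p_val = stats.ttest_rel(cys_vals, noncys_vals)
        p_values.append(p_val)
    df = df.copy()
    df['p-value'] = p_values
    return df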
def main(input_path, output_path, sample_name, sample_type='whole_cell', replicate_threshold=0, simple=True, interactive=True, Bokeh_plot=True):
    logger.info(f"Analysing: {sample_name}")
    if not os.path.isdir(output_path):
        os.mkdir(output_path)

    ## COLLECTING PEPTIDE ABUNDANCES FOR NORMALISATION ##
    # open the Peptides sheet from the input workbook
    sheetname = 'Peptides'
    peptides_raw = pd.read_excel(input_path, sheet_name=sheetname)
    logger.info(f"Peptide data collected from {input_path}")
    logger.info(f"Number of peptides detected: {peptides_raw.shape[0]}")
    # collect the list of columns containing abundance ratios
    col_list = [col for col in peptides_raw.columns if 'Abundance Ratio: (' in col]
    logger.info(f"Columns detected for analysis: {col_list}")
    # calculate the mean and median peptide abundance for each column
    calcs_dict = {}
    for column in col_list:
        vals = peptides_raw[column].dropna()
        mean_val = np.mean(vals)
        median_val = np.median(vals)
        calcs_dict[column] = [mean_val, median_val]
        logger.info(f"Med. and Mean calculated for {column}")
    # convert calcs to a dataframe and set the column labels
    calcs = pd.DataFrame.from_dict(calcs_dict, orient='index')
    calcs.columns = ['Mean', 'Median']
    calcs = calcs.sort_index()
    median_list = calcs['Median']
    mean_list = calcs['Mean']
    logger.info(f"Median: {median_list}")
    logger.info(f"Mean: {mean_list}")

    ## COLLECTING PROTEIN ABUNDANCES FOR VOLCANO PLOT ##
    # open the Proteins sheet from the input workbook
    sheetname = 'Proteins'
    proteins_raw = pd.read_excel(input_path, sheet_name=sheetname)
    logger.info(f"Proteins imported from {input_path}")
    # create a summary dataframe of the original protein data
    summary_cols = ['Accession', 'Description'] + col_list
    logger.info(f"Columns for summary: {summary_cols}")
    protein_AR_summary = proteins_raw[summary_cols]
    # remove any proteins not seen in all replicates
    protein_AR_summary = DataWrangling.filter_NaNs(protein_AR_summary, filter_type='total', threshold=replicate_threshold)
    protein_AR_summary.reset_index(inplace=True, drop=True)
    logger.info(f"Protein AR: {protein_AR_summary.head(5)}")
    # normalise each dataset to the median peptide abundance
    protein_NormAR = protein_AR_summary.copy()
    for col in col_list:
        protein_NormAR[col] = protein_AR_summary[col] / median_list[col]
    logger.info(f"Protein abundances normalised to median peptide abundance: {protein_NormAR.head(5)}")
    # if IP sample, take the log2 of each sample for the t-tests
    if sample_type == "IP":
        logger.info(f'{sample_type} sample detected.')
        protein_Log2 = protein_NormAR.copy()
        for col in col_list:
            protein_Log2[col] = np.log2(protein_NormAR[col])
        logger.info(f"Log2 of protein normalised abundances calculated: {protein_Log2.head(5)}")
    else:
        logger.info(f'{sample_type} sample detected. Using normalised abundances for one-sample t-test')
    # complete a one-sample t-test on each row using the t_test_1samp function
    if sample_type == 'whole_cell':
        popmean = 1
        df = protein_NormAR
    elif sample_type == 'IP':
        popmean = 0
        df = protein_Log2
    else:
        raise ValueError(f"Unknown sample_type: {sample_type}")
    logger.info(f"Calculating one-sample t-test with population mean {popmean}")
    df = CalcUtils.t_test_1samp(df, popmean, col_list)
    # calculate the average abundance ratio
    logger.info("Calculating mean normalised Abundance Ratio...")
    df = CalcUtils.row_mean(df, col_list, 'Average')
    # Append other columns of interest for the volcano plot.
    # A volcano plot is constructed by plotting the negative log of the p-value on the y axis (usually base 10).
    # This results in data points with low p-values (highly significant) appearing toward the top of the plot.
    df['Log10 p-val'] = -(np.log10(df['p-value']))
    if sample_type == 'whole_cell':
        logger.info("Calculating Log2 Average normalised Abundance Ratio, and -Log10(p-value)...")
        df['Log2 Av AR'] = np.log2(df['Average'])
    elif sample_type == 'IP':
        logger.info("Calculating Average Log2 normalised Abundance Ratio, and -Log10(p-value)...")
        df['Log2 Av AR'] = protein_Log2['Average']
    # to produce the colour column, change the x and y limits in the original function
    logger.info("Producing colour column...")
    xcol = 'Log2 Av AR'
    ycol = 'Log10 p-val'
    df = CalcUtils.colour_column_volc(df, xcol, ycol)
    logger.info(f"Post calculation results: {df.head(5)}")
    # collect dataframes and descriptors to save using the df_to_excel function
    data_frames = [calcs, protein_AR_summary, protein_NormAR, df]
    sheetnames = ['Med+Mean Calcs', 'Protein AR', 'ProtAR Norm to Med', 'Significance_test']
    output = output_path + sample_name + 'ProteinAbundance_Results.xlsx'
    FileHandling.df_to_excel(output, sheetnames, data_frames)
    logger.info(f"Dataframes saved to excel file at {output}...")
    logger.info("Preparing data for volcano plot")
    # gather data for the scatter (volcano) plot
    xdata, xlabel = (df['Log2 Av AR'], 'Log2 Av. Abundance Ratio')
    ydata, ylabel = (df['Log10 p-val'], '-Log10 p-value')
    title = sample_name
    datalabels = df['Accession']
    colours = df['colours']
    if simple:
        # simple volcano plot, which is saved into the pdf
        fig1 = PlotUtils.simple_scatter(xdata, ydata, title, xlabel, ylabel, colours)
        output = output_path + sample_name + 'Simple_Volcano_'
        FileHandling.fig_to_pdf([fig1], output)
        FileHandling.fig_to_svg(['Simple_Volcano'], [fig1], output)
        plt.show()
    if interactive:
        # interactive volcano plot
        fig2 = PlotUtils.inter_scatter(xdata, ydata, xlabel, ylabel, colours, title, datalabels)
        # initial drawing of the scatterplot
        plt.plot()
        logger.info("Interactive scatterplot done")
        # present the scatterplot
        # plt.show()
        output = output_path + sample_name + 'Interactive_Volcano_'
        FileHandling.fig_to_pdf([fig2], output)
        FileHandling.fig_to_svg(['Interactive_Volcano'], [fig2], output)
    if Bokeh_plot:
        output = output_path + sample_name + "_VolcanoPlot_Bokeh.html"
        output_file(output, title=sample_name)
        logger.info(f"Output html will be saved to {output_path}")
        hovers = [
            ('Protein', '@Accession'),
            ('Gene', '@Description'),
        ]
        fig3 = PlotUtils.bokeh_volcano_maker(df=df, c_col='Log10 p-val', y_col='Log10 p-val', x_col='Log2 Av AR', title=sample_name + ' Volcano Plot', hover_list=hovers)
        show(fig3)
    # figures have been saved to pdf and as svg files above
    logger.info(f"Volcano plots saved to {output_path}")
    logger.info(f"Analysis complete for {sample_name}")
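# CalcUtils.t_test_1samp and CalcUtils.colour_column_volc live elsewhere in the package; the
# function below is an assumed illustration of the volcano-plot statistics: a row-wise
# one-sample t-test against popmean (1 for normalised whole-cell ratios, 0 for log2 IP ratios),
# followed by a simple significance/fold-change colour assignment. The thresholds are
# illustrative defaults, not the project's actual cut-offs.
def example_volcano_stats(df, popmean, col_list, p_cutoff=0.05, fc_cutoff=1.0):
    import numpy as np
    from scipy import stats

    df = df.copy()
    # one-sample t-test per protein (row) across the replicate abundance columns
    _, p_vals = stats.ttest_1samp(df[col_list].astype(float), popmean, axis=1, nan_policy='omit')
    df['p-value'] = p_vals
    df['Log10 p-val'] = -np.log10(df['p-value'])
    df['Log2 Av AR'] = np.log2(df[col_list].mean(axis=1))
    # colour points that pass both the p-value and fold-change thresholds
    significant = (df['p-value'] < p_cutoff) & (df['Log2 Av AR'].abs() > fc_cutoff)
    df['colours'] = np.where(significant, 'red', 'grey')
    return df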