Example #1
0
def multifile_hist(input_folder, output_path, sample_name):
    """
    Build peptide abundance histograms for every '*Compiled.xlsx' file in a folder
    and save them all into a single pdf.

    Parameters:
    input_folder: string
        folder searched (non-recursively) for files ending in 'Compiled.xlsx'.
        If none are found, FileHandling.PD_compiler is run on the folder and
        the search is repeated once.
    output_path: string
        passed unchanged to FileHandling.fig_to_pdf
    sample_name: string
        prefixed to 'PepAbundHist' to form the fig_type passed to fig_to_pdf

    Returns:
    figures: dict
        merged result of PlotUtils.pep_abund_hist for every compiled file
        (later files overwrite earlier ones on duplicate keys)
    """
    logger.info(f'Input Folder detected: {input_folder}')

    # Same pattern is used before and after the optional compile step
    search_pattern = input_folder + '/*Compiled.xlsx'
    workbooks = glob.glob(search_pattern)
    if len(workbooks) == 0:
        # Nothing compiled yet: generate the files, then look again
        FileHandling.PD_compiler(input_folder)
        workbooks = glob.glob(search_pattern)

    # First pass: produce the figures for each workbook
    per_file_results = []
    for workbook in workbooks:
        logger.info(workbook)
        per_file_results.append(PlotUtils.pep_abund_hist(workbook))

    # Second pass: flatten the per-file results into one mapping
    # (presumably each result is a dict of name -> figure; verify against PlotUtils)
    figures = {}
    for result in per_file_results:
        figures.update(result)

    histogram_label = sample_name + 'PepAbundHist'
    FileHandling.fig_to_pdf(figures, output_path, fig_type=histogram_label)
    logger.info(f"Figures saved to {output_path}")
    return figures
Example #2
0
def main(input_path, output_path, sample_name):
    """
    Generate peptide abundance histograms for a single file or a folder of files.

    Parameters:
    input_path: string
        either a directory (handed to multifile_hist) or a single file path
        (handed directly to PlotUtils.pep_abund_hist)
    output_path: string
        directory for the generated pdf; created (including any missing
        parent directories) when it does not exist
    sample_name: string
        prefixed to 'PepAbundHist' to form the fig_type passed to fig_to_pdf

    Returns:
    figures:
        the value returned by multifile_hist (directory input) or by
        PlotUtils.pep_abund_hist (single file input)
    """
    logger.info('Input path: {}'.format(input_path))

    # makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it also
    # creates missing parents and cannot fail if the directory appears
    # between the check and the creation.
    os.makedirs(output_path, exist_ok=True)

    if os.path.isdir(input_path):
        # multifile_hist saves its own pdf and logs the location
        figures = multifile_hist(input_path, output_path, sample_name)
    else:
        figures = PlotUtils.pep_abund_hist(input_path)
        FileHandling.fig_to_pdf(figures,
                                output_path,
                                fig_type=sample_name + 'PepAbundHist')
        logger.info(f"Figures saved to {output_path}")

    return figures
def main(input, output_path, sample_name, x_vals=None, test_elements=None, group_col=None, element_col=None, svg=None):
    """
    Produce one fitted figure per group (or per element) of the input data and save them to pdf.

    Parameters:
    input: DataFrame or string
        either the data itself, or the full path of an excel file readable by pd.read_excel.
        (The name shadows the builtin but is kept because callers may pass it by keyword.)
    output_path: string
        output directory; created if missing. File outputs are written to
        output_path + sample_name (plain string concatenation).
    sample_name: string
        sample name used for logging and as the output file prefix
    x_vals: optional
        passed unchanged to per_protein_fitter
    test_elements: iterable, optional
        if given, only these keys are fitted and plotted; every entry must be an
        existing group/element key (KeyError otherwise)
    group_col: string, optional
        column whose unique values define the groups plotted together
    element_col: string, optional
        column whose unique values define individual elements. When omitted, the
        index is used for grouping and the list of index labels is passed to
        per_protein_fitter in its place.
    svg: optional
        if truthy, figures are additionally saved with FileHandling.fig_to_svg

    Returns:
    figure_dict: dict
        maps each fitted key to the figure returned by per_protein_fitter

    Raises:
    ValueError
        if input is neither a DataFrame nor the path of an existing file
    """
    # Validate the input before any side effect (directory creation, plotting).
    # Previously a bad input was only logged and the function then failed with
    # an unbound raw_data a few lines later.
    is_frame = isinstance(input, pd.DataFrame)
    if not is_frame:
        is_path = isinstance(input, (str, os.PathLike))
        if not (is_path and os.path.isfile(input)):
            raise ValueError(
                "Incorrect input format detected. Please pass full file path, or a dataframe as input."
            )

    logger.info(f"Analysing: {sample_name}")
    if not os.path.isdir(output_path):
        os.makedirs(output_path, exist_ok=True)
        logger.info(f"Output directory made at {output_path}")

    if is_frame:
        logger.info('DataFrame input detected')
        raw_data = input
    else:
        logger.info(f'Input file being loaded from {input}')
        raw_data = pd.read_excel(input)

    # Decide what the rows are grouped by. A grouping column (e.g. proteins) plots
    # all of its elements together; otherwise elements are plotted separately,
    # keyed by element_col or, when that is missing, by the index.
    element_col_given = bool(element_col)
    if not element_col_given:
        # NOTE(review): the fitter receives the list of index labels here, as in
        # the original code - confirm that is what per_protein_fitter expects.
        element_col = raw_data.index.tolist()
        logger.info('No element column detected. Using index instead.')

    if group_col:
        grouping_labels = raw_data[group_col]
    elif element_col_given:
        grouping_labels = raw_data[element_col]
    else:
        # The original indexed the frame with the list of index labels, which
        # cannot yield a column of labels; group on the index itself instead.
        grouping_labels = raw_data.index

    # unique() keeps first-appearance order, so figures follow the data order
    per_element_dict = {
        element: raw_data[grouping_labels == element]
        for element in grouping_labels.unique()
    }

    # If test elements are passed, only produce the plots for those, otherwise fit whole df
    if test_elements:
        data_for_fitting = {k: per_element_dict[k] for k in test_elements}
    else:
        data_for_fitting = per_element_dict

    # Create the figure for each selected group or element
    figure_dict = {}
    for key, value in data_for_fitting.items():
        fig = per_protein_fitter(value, x_vals, group_col, element_col)
        fig.suptitle(key)
        figure_dict[key] = fig
        # pyplot.show takes no figure argument (only the keyword `block`)
        plt.show()

    FileHandling.fig_to_pdf(figure_dict, output_path=output_path+sample_name, fig_type='_Sigmoids')
    if svg:
        FileHandling.fig_to_svg(fig_names=list(figure_dict.keys()), fig_list=list(figure_dict.values()), output_path=output_path+sample_name)
    logger.info(f'Figures saved to {output_path}')

    return figure_dict
def main(input_path, output_path, sample_name):
    """
    Master function to apply a list of functions to the input file, generating urea denaturation curve for each protein

    Parameters:
    input_path: string
        input path for the file to be processed
    output_path: string
        output path for which any output generated by functions will be saved.
        Used as-is for the first excel export and with 'Thresholded_' appended
        (plain string concatenation) for the pdf and the filtered excel export.
    sample_name: string
        sample name associated with the file to be processed (used for logging only).

    Returns:
    summary_data: DataFrame
        the ProteinID, Sequence and '_Cys/NonCys' ratio columns of the summary
        table, i.e. the data before per-peptide averaging and NaN filtering
    """
    logger.info('Input Path: {}'.format(input_path))

    logger.info(f'Preparing to process {sample_name}....')
    total_data = FileHandling.file_reader(input_path)
    quant_data, col_list = DataWrangling.quantified_data(total_data)
    two_unique_cys, cys_pep, non_cys_pep = DataWrangling.Unique_Cys_sorter(
        quant_data)
    #set index of summary dataframes to the protein accession
    cys_pep = cys_pep.set_index(["Master Protein Accessions"], drop=False)
    non_cys_pep = non_cys_pep.set_index(["Master Protein Accessions"],
                                        drop=False)

    non_cys_Av = CalcUtils.non_cys_AR(cys_pep, non_cys_pep)

    summary_table = CalcUtils.cys_div_noncys(cys_pep, non_cys_Av, col_list)

    #Saving all dataframes so far to excel results document
    data_frames = [
        total_data, quant_data, two_unique_cys, cys_pep, non_cys_pep,
        summary_table
    ]
    sheetnames = [
        'Total Data', 'Quant Data', 'TwoUniqueCYS', 'CysPep', 'NonCysPep',
        'Summary Table'
    ]
    FileHandling.df_to_excel(output_path, sheetnames, data_frames)

    #collect only columns of interest
    summary_table.reset_index(drop=True, inplace=True)
    ratio_col = [col for col in summary_table.columns if '_Cys/NonCys' in col]
    select_col = ['Master Protein Accessions', 'Annotated Sequence'
                  ] + ratio_col
    summary_data = summary_table[select_col]
    logger.debug(summary_data)
    #rename columns to simple names
    summary_data = summary_data.rename(columns={
        'Master Protein Accessions': 'ProteinID',
        'Annotated Sequence': 'Sequence'
    })
    #for peptides seen more than once in a sample, take average ratio to give only unique ratios for each peptide
    av_summary = CalcUtils.single_element_av(summary_data, 'Sequence')

    ##Filtering for proteins which have too many missing values, and generating plots.
    logger.info('Filtering for missing values...')
    #removing rows with >thresh Nans
    filtered_consensus = DataWrangling.filter_NaNs(av_summary,
                                                   filter_type='total',
                                                   threshold=0)
    #preparing variables and plotting scatter for each protein
    logger.info('Creating scatter plots...')
    urea_conc = [0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5, 5.5, 6]
    fig_dict = PlotUtils.multirow_scatter(filtered_consensus,
                                          key='ProteinID',
                                          col_head='Sequence',
                                          x_vals=urea_conc,
                                          x_label='Urea Conc',
                                          y_label='Cys_NonCys')

    #to save all figures to pdf
    FileHandling.fig_to_pdf(fig_dict, output_path + 'Thresholded_')
    logger.info('Save figs to pdf complete')
    #to show all figures as output: pyplot.show displays every open figure and
    #takes no figure argument (only the keyword `block`), so a single call
    #replaces the former per-figure plt.show(figure) loop
    if fig_dict:
        plt.show()

    #save the filtered (threshold 0) data alongside the thresholded figures
    FileHandling.df_to_excel(output_path=output_path + 'Thresholded_',
                             sheetnames=['Total_0'],
                             data_frames=[filtered_consensus])

    return summary_data
Example #5
0
def main(input_path, output_path, sample_name, do_plots=True):
    """
    Master function to apply a list of functions to the input file

    Parameters:
    input_path: string
        input path for the file to be processed
    output_path: string
        output path for which any output generated by functions will be saved.
        The excel export goes to output_path + sample_name + '_Foldedness_'
        (plain string concatenation).
    sample_name: string
        sample name associated with the file to be processed.
    do_plots: bool
        when true (default) the foldedness scatter plot is created, saved to
        pdf and shown.

    Returns:
    summary_data: DataFrame
        ProteinID, Sequence and abundance ratio columns with the t-test,
        log-transformed and colour columns added by the steps below
    """
    logger.info(f'Preparing to process: {sample_name}')
    logger.info(f"Input Path: {input_path}")

    total_data = FileHandling.file_reader(input_path)

    quant_data, col_list = DataWrangling.quantified_data(total_data)
    #raj only considers peptides that are observed in all replicates
    #(how and thresh are mutually exclusive in DataFrame.dropna, so only how is passed)
    quant_data = quant_data.dropna(axis=0, how='any', subset=col_list)
    two_unique_cys, cys_pep, non_cys_pep = DataWrangling.Unique_Cys_sorter(
        quant_data)
    #set index of summary dataframes to the protein accession
    cys_pep = cys_pep.set_index(["Master Protein Accessions"], drop=False)
    non_cys_pep = non_cys_pep.set_index(["Master Protein Accessions"],
                                        drop=False)

    non_cys_Av = CalcUtils.non_cys_AR(cys_pep, non_cys_pep)

    summary_table = CalcUtils.cys_div_noncys(cys_pep, non_cys_Av, col_list)

    #collect list of columns for each type of ratio
    summary_table.reset_index(drop=True, inplace=True)
    abundance_cols = [
        col for col in summary_table.columns if 'Abundance Ratio: (' in col
    ]
    ratio_col = [col for col in abundance_cols if '_Cys/NonCys' in col]
    logger.info(f"Collecting ratio columns: {ratio_col}")
    NC_col = [col for col in abundance_cols if '_NC' in col]
    logger.info(f"Collecting NonCys columns: {NC_col}")
    C_col = [col for col in abundance_cols if ')_' not in col]
    logger.info(f"Collecting Cys columns: {C_col}")

    #collect only columns of interest for summary table, keeping complete rows only.
    #Chained (not inplace) so the operation never acts on a selection of
    #summary_table, which is what triggers pandas' SettingWithCopy warning.
    select_col = ['Master Protein Accessions', 'Annotated Sequence'
                  ] + abundance_cols
    summary_data = summary_table[select_col].dropna(
        axis=0, how='any', subset=abundance_cols).reset_index(drop=True)
    logger.debug(F"Summary data acquired: {summary_data}")

    #rename columns to simple names
    summary_data = summary_data.rename(columns={
        'Master Protein Accessions': 'ProteinID',
        'Annotated Sequence': 'Sequence'
    })

    #####Paired T-test
    logger.info("Calculating t-test statistics")
    summary_data = CalcUtils.t_test_pair(summary_data, C_col, NC_col)

    #####Include -log10 of p-value
    summary_data['-Log10 p-Value'] = -np.log10(summary_data['p-value'])

    #####Include Log2 of Average ratios
    summary_data = CalcUtils.row_mean(summary_data, ratio_col, 'Av. Ratio')
    summary_data['Log2 Average Ratio'] = np.log2(summary_data['Av. Ratio'])

    ##### Include average col for NC
    summary_data = CalcUtils.row_mean(summary_data, NC_col, 'NC Average')
    summary_data['Log2 Average NC'] = np.log2(summary_data['NC Average'])

    #####Assign colour column
    summary_data = DataWrangling.colour_column(summary_data, '-Log10 p-Value',
                                               'p-value colour')
    logger.info("Completed calculations with summary data table")
    logger.debug(f"Summary data table post calculations: {summary_data}")

    #Saving all dataframes so far to excel results document
    data_frames = [
        total_data, quant_data, two_unique_cys, cys_pep, non_cys_pep,
        summary_table, summary_data
    ]
    sheetnames = [
        'Total Data', 'Quant Data', 'TwoUniqueCYS', 'CysPep', 'NonCysPep',
        'Summary Info', 'Summary Data'
    ]
    FileHandling.df_to_excel(output_path=output_path + sample_name +
                             '_Foldedness_',
                             sheetnames=sheetnames,
                             data_frames=data_frames)
    #f-prefix was missing, so the placeholder used to be logged literally
    logger.info(f"All dataframes saved to {output_path}")

    if do_plots:
        logger.info(f"Preparing foldedness scatterplot for {sample_name}")
        figures = {
            'Foldedness Scatter': foldedness_scatter(summary_data, sample_name)
        }
        #### Save figs to pdf
        FileHandling.fig_to_pdf(figures,
                                output_path=output_path,
                                fig_type=sample_name + '_Foldedness')
        logger.info(f"Figures saved to {output_path}")
        #pyplot.show displays all open figures and takes no figure argument
        plt.show()

    return summary_data
def main(input_path,
         output_path,
         sample_name,
         sample_type='whole_cell',
         replicate_threshold=0,
         simple=True,
         interactive=True,
         Bokeh_plot=True):
    """
    Normalise protein abundance ratios to the median peptide abundance, run a
    one-sample t-test per protein and produce volcano plots.

    Parameters:
    input_path: string
        excel workbook containing a 'Peptides' and a 'Proteins' sheet
    output_path: string
        output directory, created if missing. Output file names are built as
        output_path + sample_name + suffix (plain string concatenation).
    sample_name: string
        sample name used for logging, plot titles and output file names
    sample_type: string
        'whole_cell' (t-test of normalised ratios against 1) or
        'IP' (t-test of log2 normalised ratios against 0)
    replicate_threshold: int
        passed as threshold to DataWrangling.filter_NaNs
    simple: bool
        create, save (pdf and svg) and show the simple volcano plot
    interactive: bool
        create and save (pdf and svg) the interactive volcano plot
    Bokeh_plot: bool
        create and show the Bokeh html volcano plot

    Returns:
    None; results are written to disk.

    Raises:
    ValueError
        if sample_type is neither 'whole_cell' nor 'IP'
    """
    # Fail fast: any other value used to run the whole import/normalisation
    # and then crash on the unbound popmean/df variables.
    if sample_type not in ('whole_cell', 'IP'):
        raise ValueError(
            f"Unsupported sample_type {sample_type!r}: expected 'whole_cell' or 'IP'"
        )

    logger.info(f"Analysing: {sample_name}")
    # creates missing parents too and is safe if the directory already exists
    os.makedirs(output_path, exist_ok=True)

    ## COLLECTING PEPTIDE ABUNDANCES FOR NORMALISATION ##
    peptides_raw = pd.read_excel(input_path, sheet_name='Peptides')
    logger.info(f"Peptide data collected from {input_path}")
    logger.info(f"Number of peptides detected: {peptides_raw.shape[0]}")

    #collect list of columns containing abundance Ratios
    col_list = [
        col for col in peptides_raw.columns if 'Abundance Ratio: (' in col
    ]
    logger.info(f"Columns detected for analysis: {col_list}")

    # calculating mean and median of the non-missing peptide abundances per column
    calcs_dict = {}
    for column in col_list:
        vals = peptides_raw[column].dropna()
        calcs_dict[column] = [np.mean(vals), np.median(vals)]
        logger.info(f"Med. and Mean calculated for {column}")

    # convert calcs to dataframe and set column labels
    calcs = pd.DataFrame.from_dict(calcs_dict, orient='index')
    calcs.columns = ['Mean', 'Median']
    calcs = calcs.sort_index()
    median_list = calcs['Median']
    mean_list = calcs['Mean']

    logger.info(f"Median: {median_list}")
    logger.info(f"Mean: {mean_list}")

    ## COLLECTING PROTEIN ABUNDANCES FOR VOLCANO PLOT ##
    proteins_raw = pd.read_excel(input_path, sheet_name='Proteins')
    logger.info(f"Proteins imported from {input_path}")

    # creating summary data_frame of original protein data
    # (the Proteins sheet must carry the same abundance ratio columns as Peptides)
    summary_cols = ['Accession', 'Description'] + col_list
    logger.info(f"Columns for summary: {summary_cols}")
    protein_AR_summary = proteins_raw[summary_cols]
    # remove any proteins not seen in all replicates
    protein_AR_summary = DataWrangling.filter_NaNs(
        protein_AR_summary, filter_type='total', threshold=replicate_threshold)
    protein_AR_summary.reset_index(inplace=True, drop=True)
    logger.info(f"Protein AR: {protein_AR_summary.head(5)}")

    # Normalising each dataset to the Median Peptide Abundance
    protein_NormAR = protein_AR_summary.copy()
    for col in col_list:
        protein_NormAR[col] = protein_AR_summary[col] / median_list[col]
    logger.info(
        f"Protein abundances normalised to median peptide abundance: {protein_NormAR.head(5)}"
    )

    # Pick the data and population mean for the one-sample t-test.
    # IP samples are tested on the Log2 scale against 0, whole cell samples
    # on the normalised ratios against 1.
    if sample_type == 'IP':
        logger.info(f'{sample_type} sample detected.')
        protein_Log2 = protein_NormAR.copy()
        for col in col_list:
            protein_Log2[col] = np.log2(protein_NormAR[col])
        logger.info(
            f"Log2 of Protein normalisaed abundances calculated: {protein_Log2.head(5)}"
        )
        popmean = 0
        df = protein_Log2
    else:
        logger.info(
            f'{sample_type} sample detected. Using normalised abundances for one sample t-test'
        )
        popmean = 1
        df = protein_NormAR

    logger.info(
        f"Calculating One Sample t-test with population mean {popmean}")
    df = CalcUtils.t_test_1samp(df, popmean, col_list)

    # Calculating the average abundance ratio
    logger.info("Calculating mean normalised Abundance Ratio...")
    df = CalcUtils.row_mean(df, col_list, 'Average')

    # Appending other columns of interest for the volcano plot
    # A volcano plot is constructed by plotting the negative log
    # of the p value on the y axis (usually base 10). This results
    # in data points with low p values (highly significant) appearing toward the top of the plot.
    df['Log10 p-val'] = -(np.log10(df['p-value']))
    if sample_type == 'whole_cell':
        logger.info(
            "Calculating Log2 Average normalised Abundance Ratio, and -Log10(p-value)..."
        )
        df['Log2 Av AR'] = np.log2(df['Average'])
    else:
        logger.info(
            "Calculating Average Log2 normalised Abundance Ratio, and -Log10(p-value)..."
        )
        # Values are already on the Log2 scale. Read 'Average' from df (where
        # row_mean's result was stored) rather than from protein_Log2, which
        # only carries the column if the CalcUtils helpers mutate in place.
        df['Log2 Av AR'] = df['Average']

    # To produce the colour column, change x and y limits in original function
    logger.info("Producing colour column...")
    xcol = 'Log2 Av AR'
    ycol = 'Log10 p-val'
    df = CalcUtils.colour_column_volc(df, xcol, ycol)
    logger.info(f"Post calculation results: {df.head(5)}")

    # Collecting dataframes and descriptors to save using the df_to_excel function
    data_frames = [calcs, protein_AR_summary, protein_NormAR, df]
    sheetnames = [
        'Med+Mean Calcs', 'Protein AR', 'ProtAR Norm to Med',
        'Significance_test'
    ]
    output = output_path + sample_name + 'ProteinAbundance_Results.xlsx'
    FileHandling.df_to_excel(output, sheetnames, data_frames)
    logger.info(f"Dataframes saved to excel file at {output}...")

    logger.info("Preparing data for volcano plot")
    # Gathering data for the scatter (volcano) plot
    xdata, xlabel = (df['Log2 Av AR'], 'Log2 Av. Abundance Ratio')
    ydata, ylabel = (df['Log10 p-val'], '-Log10 p-value')
    title = sample_name
    datalabels = df['Accession']
    colours = df['colours']

    if simple:
        # for simple volcano plot, which is saved into the pdf
        fig1 = PlotUtils.simple_scatter(xdata, ydata, title, xlabel, ylabel,
                                        colours)
        output = output_path + sample_name + 'Simple_Volcano_'
        FileHandling.fig_to_pdf([fig1], output)
        FileHandling.fig_to_svg(['Simple_Volcano'], [fig1], output)
        # pyplot.show takes no figure argument (only the keyword `block`)
        plt.show()

    if interactive:
        # for interactive volcano plot
        # create the scatterplot
        fig2 = PlotUtils.inter_scatter(xdata, ydata, xlabel, ylabel, colours,
                                       title, datalabels)
        # initial drawing of the scatterplot
        # NOTE(review): plt.plot() with no data is kept from the original - confirm it is needed
        plt.plot()
        logger.info("Interactive scatterplot done")
        output = output_path + sample_name + 'Interactive_Volcano_'
        FileHandling.fig_to_pdf([fig2], output)
        FileHandling.fig_to_svg(['Interactive_Volcano'], [fig2], output)

    if Bokeh_plot:
        output = output_path + sample_name + "_VolcanoPlot_Bokeh.html"
        output_file(output, title=sample_name)
        logger.info(f"Output html will be saved to {output_path}")

        hovers = [
            ('Protein', '@Accession'),
            ('Gene', '@Description'),
        ]

        fig3 = PlotUtils.bokeh_volcano_maker(df=df,
                                             c_col='Log10 p-val',
                                             y_col='Log10 p-val',
                                             x_col='Log2 Av AR',
                                             title=sample_name +
                                             ' Volcano Plot',
                                             hover_list=hovers)
        show(fig3)

    # All requested figures have been written by this point
    logger.info(f"Volcano plots saved to {output_path}")
    logger.info(f"Analysis complete for {sample_name}")