def output_psm_level_report(df, output_stem: str):
    """Write a PSM-level table to an Excel report and return it.

    Rows whose ``uniprot`` value starts with ``'Reverse_'`` or
    ``'contaminant'`` are removed.  The remaining rows are indexed by
    ``(uniprot, symbol, description, clean_sequence)``, sorted starting from
    index level 1 (``symbol``), and the ``unique_sequence`` column is
    dropped before writing.

    Args:
        df: Frame with at least the ``uniprot``, ``symbol``,
            ``description``, ``clean_sequence`` and ``unique_sequence``
            columns.
        output_stem: Leading part of the report file name.  The trailing
            ``{}`` placeholder is left for
            ``utils.get_timestamped_report_path`` to fill in (presumably
            with a timestamp; the helper is not visible here).

    Returns:
        The indexed and sorted frame that was written to disk.
    """
    kept = df
    for excluded_prefix in ('Reverse_', 'contaminant'):
        kept = kept[~kept.uniprot.str.startswith(excluded_prefix)]

    index_cols = ['uniprot', 'symbol', 'description', 'clean_sequence']
    psm_df = kept.set_index(index_cols)
    psm_df = psm_df.sort_index(level=1)
    psm_df = psm_df.drop(columns=['unique_sequence'])

    report_path = utils.get_timestamped_report_path(
        f'{output_stem}_{{}}.xlsx', DATA_OUTPUT_PATH)
    psm_df.to_excel(report_path)
    return psm_df
def tabulate_timecourse(glob, name):
    """Write the extra supplementary-table sheets for one timecourse table.

    The newest file matching ``glob`` in ``DATA_OUTPUT_PATH`` is read with
    ``uniprot`` as index.  Three ratios of the per-condition mean columns
    are computed and thresholded, and three sheets are written to a
    timestamped workbook:

    * ``Dynamic (<name>)``: rows with ``DMSO t0 / DMSO t1`` at or above
      ``DYNAMIC_FOLD_CHANGE_THRESHOLD`` (without the yes/no flag column);
    * ``Palm M Regulated (<name>)``: rows with ``Palm M t1 / DMSO t1`` at
      or above ``CHASE_FOLD_CHANGE_THRESHOLD``;
    * ``ABD957 M Regulated (<name>)``: rows with ``ABD957 t1 / DMSO t1``
      at or above ``CHASE_FOLD_CHANGE_THRESHOLD``.

    Args:
        glob: File pattern handed to ``utils.get_newest_file``.
        name: Label used in the sheet names; with ``-`` removed and
            lower-cased it also becomes part of the output file name.
    """
    source_path = utils.get_newest_file(DATA_OUTPUT_PATH, glob)
    table = pd.read_excel(source_path, index_col='uniprot')

    # Fold changes relative to the DMSO t1 mean.
    dmso_t1 = table['DMSO t1 (mean)']
    table['DMSO t0/t1'] = table['DMSO t0 (mean)'] / dmso_t1
    table['Palm M t1/DMSO t1'] = table['Palm M t1 (mean)'] / dmso_t1
    table['ABD957 t1/DMSO t1'] = table['ABD957 t1 (mean)'] / dmso_t1

    is_dynamic = table['DMSO t0/t1'] >= DYNAMIC_FOLD_CHANGE_THRESHOLD
    is_palm_protected = (
        table['Palm M t1/DMSO t1'] >= CHASE_FOLD_CHANGE_THRESHOLD)
    is_abd_protected = (
        table['ABD957 t1/DMSO t1'] >= CHASE_FOLD_CHANGE_THRESHOLD)

    flag_col = 'Dynamically palmitoylated?'
    table[flag_col] = 'no'
    table.loc[is_dynamic, flag_col] = 'yes'

    sheet_cols = [
        'symbol',
        'description',
        'DMSO t0 (mean)',
        'Palm M t0 (mean)',
        'ABD957 t0 (mean)',
        'DMSO t1 (mean)',
        'Palm M t1 (mean)',
        'ABD957 t1 (mean)',
        flag_col,
    ]

    file_label = name.replace("-", "").lower()
    report_path = utils.get_timestamped_report_path(
        f'timecourse_{file_label}_supp_table_extra_sheets_{{}}.xlsx',
        DATA_OUTPUT_PATH)

    with pd.ExcelWriter(report_path) as writer:
        # Every row on the dynamic sheet is 'yes', so the flag column is
        # left out there.
        dynamic_sheet = table.loc[is_dynamic, sheet_cols]
        dynamic_sheet = dynamic_sheet.drop(columns=[flag_col])
        dynamic_sheet.to_excel(writer, sheet_name=f'Dynamic ({name})')

        table.loc[is_palm_protected, sheet_cols].to_excel(
            writer, sheet_name=f'Palm M Regulated ({name})')
        table.loc[is_abd_protected, sheet_cols].to_excel(
            writer, sheet_name=f'ABD957 M Regulated ({name})')
def unfiltered_report(analysis, report_output_prefix: str):
    """Dump every measurement of the included experiments to Excel.

    All rows of ``analysis.data_model`` whose ``experiment_id`` is in
    ``analysis.experiment_ids_included`` are loaded, labelled with the name
    of the dataset their experiment belongs to (``condition`` column), and
    written to a timestamped ``unfiltered_<prefix>_*.xlsx`` file inside
    ``analysis.params.output_folder``.

    Rows whose ``uniprot`` starts with ``'Reverse_'`` are dropped and the
    first whitespace-separated token of each ``description`` is removed.

    Args:
        analysis: Analysis object exposing ``data_model``,
            ``experiment_ids_included``, ``datasets`` and
            ``params.output_folder``.
        report_output_prefix: Text inserted into the report file name.

    Returns:
        The frame that was written, indexed by
        ``(uniprot, symbol, description, condition, experiment)``.
    """
    m = analysis.data_model

    query = (
        m.select(
            m.id,
            m.experiment,
            m.uniprot,
            m.symbol,
            m.description,
            m.sequence,
            m.mass,
            m.charge,
            m.rsquared,
            m.ratio,
        )
        .where(m.experiment_id.in_(analysis.experiment_ids_included))
    )

    df = pd.DataFrame.from_records(list(query.dicts()))

    for dataset in analysis.datasets:
        in_dataset = df.experiment.isin(dataset.experiment_ids_included)
        df.loc[in_dataset, 'condition'] = dataset.name

    # Copy the filtered rows so the column assignment below writes to an
    # independent frame rather than to a slice of the unfiltered one.
    df = df[~df.uniprot.str.startswith('Reverse_')].copy()
    df['description'] = df.description.str.split().str[1:].str.join(' ')

    index_cols = ['uniprot', 'symbol', 'description', 'condition', 'experiment']
    df = df.set_index(index_cols).sort_index(level=0)

    report_output_path = utils.get_timestamped_report_path(
        f'unfiltered_{report_output_prefix}_{{}}.xlsx',
        pathlib.Path(analysis.params.output_folder),
    )

    # No ``encoding`` argument: pandas documents it as unused by the .xlsx
    # writers and has removed the keyword in 2.0.
    df.to_excel(report_output_path)

    return df
def tabulate_hydroxylamine():
    """Write the combined hydroxylamine-sensitivity sheet.

    For each of the labels 'OCI-AML3', 'NB-4' and 'ON' the newest
    ``filtered_hydroxylamine_*`` workbook in ``DATA_OUTPUT_PATH`` is loaded
    through ``get_hydroxylamine_sensitive_df``.  The three tables are
    outer-joined on their index, reduced to symbol, description and the
    three ``mean_reduction`` columns, sorted by the OCI-AML3 reduction
    (descending), and joined with selected columns of
    ``input/swisspalm_search.xlsx`` (indexed by 'Query identifier').
    Missing values are written as ``'-'``.  The result goes to a
    timestamped workbook with one sheet, 'Hydroxylamine Sensitive'.
    """
    # Locate the newest input workbook per label.
    newest = {
        label: utils.get_newest_file(DATA_OUTPUT_PATH, pattern)
        for label, pattern in (
            ('NB-4', 'filtered_hydroxylamine_nb4_*.xlsx'),
            ('OCI-AML3', 'filtered_hydroxylamine_oci_p_*.xlsx'),
            ('ON', 'filtered_hydroxylamine_oci_o_*.xlsx'),
        )
    }

    tables = [
        get_hydroxylamine_sensitive_df(newest[label], name=label)
        for label in ('OCI-AML3', 'NB-4', 'ON')
    ]

    df = tables[0]
    for other in tables[1:]:
        df = df.join(other, how='outer')

    # Take symbol/description from the first table that has a value.
    symbol = df['symbol (OCI-AML3)'].fillna(df['symbol (NB-4)'])
    df['symbol'] = symbol.fillna(df['symbol (ON)'])
    description = df['description (OCI-AML3)'].fillna(
        df['description (NB-4)'])
    df['description'] = description.fillna(df['description (ON)'])

    kept_cols = [
        'symbol',
        'description',
        'mean_reduction (OCI-AML3)',
        'mean_reduction (NB-4)',
        'mean_reduction (ON)',
    ]
    df = df[kept_cols]
    df = df.sort_values(by='mean_reduction (OCI-AML3)', ascending=False)
    df = df.fillna('-')

    swisspalm_cols = [
        # 'Query identifier',
        # 'UniProt AC',
        # 'UniProt ID',
        # 'UniProt status',
        # 'Organism',
        # 'Gene names',
        # 'Description',
        'Number of palmitoyl-proteomics articles',
        'Number of palmitoyl-proteomics studies where the protein appears in a high confidence hit list',
        'Number of technique categories used in palmitoyl-proteomics studies',
        'Technique categories used in palmitoyl-proteomics studies',
        'Number of targeted studies',
        'Targeted studies (PMIDs)',
        # 'PATs',
        # 'APTs',
        'Number of sites',
        'Sites in main isoform',
        'Number of isoforms',
        'Max number of cysteines',
        'Max number of cysteines in TM or cytosolic domain',
        'Predicted to be S-palmitoylated?',
        'Predicted to be S-palmitoylated in cytosolic domains?',
        'Protein has hits in SwissPalm?',
        'Orthologs of this protein have hits in SwissPalm?',
    ]
    swisspalm = pd.read_excel('input/swisspalm_search.xlsx')
    swisspalm = swisspalm.set_index('Query identifier')[swisspalm_cols]

    df = df.join(swisspalm).fillna('-')

    report_path = utils.get_timestamped_report_path(
        'hydroxylamine_supp_table_extra_sheets_{}.xlsx', DATA_OUTPUT_PATH)
    with pd.ExcelWriter(report_path) as writer:
        df.to_excel(writer, sheet_name='Hydroxylamine Sensitive')
def timecourse(ha_proteins):
    """Run the timecourse filters and write the combined NB4 table.

    ``filter_timecourse_data`` is run for four parameter files.  The two
    NB4 results (10plex and 6plex) are outer-joined, per-condition mean,
    count and population standard deviation (``ddof=0``) are recomputed
    over all ``percent`` columns of the joined table, and rows are kept
    only if they have

    * at least 4 values for 'ABD957 t1' and 'DMSO t1',
    * at least 2 values for 'ABD957 t0' and 'DMSO t0',
    * no condition with a standard deviation of 100 or more.

    The selected columns are written to a timestamped
    ``nb4_6plex_10plex_*.xlsx`` file in ``DATA_OUTPUT_PATH``.

    Args:
        ha_proteins: Forwarded to ``filter_timecourse_data`` as the
            collection of UniProt ids that passed the hydroxylamine filter.
    """
    OUTPUT_PSM_LEVEL = True
    OUTPUT_TO_EXCEL = True
    MIN_NUM_UNIQUE_SEQUENCES = 2

    shared_options = dict(
        min_num_unique_sequences=MIN_NUM_UNIQUE_SEQUENCES,
        output_psm_level=OUTPUT_PSM_LEVEL,
        output_to_excel=OUTPUT_TO_EXCEL,
    )

    # These two runs are made only for the report files they write; their
    # return values are not used below.
    filter_timecourse_data(
        'oci_p', 'resubmission_17odya.yaml', ha_proteins,
        min_num_datasets=2, **shared_options)
    filter_timecourse_data(
        'oci_o', 'resubmission_ocio_17odya.yaml', ha_proteins,
        min_num_datasets=2, **shared_options)

    nb4_10plex = filter_timecourse_data(
        'nb4', 'resubmission_nb4_17odya.yaml', ha_proteins,
        min_num_datasets=1, **shared_options)
    nb4_6plex = filter_timecourse_data(
        '6plex_nb4', 'resubmission_nb4_17odya_6plex.yaml', ha_proteins,
        min_num_datasets=1, **shared_options)

    #%%
    both = nb4_10plex.join(
        nb4_6plex, how='outer', lsuffix='_10plex', rsuffix='_6plex')

    # Drop the per-experiment means; they are recomputed over both
    # experiments below.
    both = both.drop(columns=list(both.filter(regex='mean')))
    # both = both.drop(columns=both.filter(regex='Palm').columns)

    # Group the percent columns by the text before the first '.', i.e. the
    # condition label, and aggregate across columns.
    by_condition = both.filter(regex='percent').groupby(
        lambda x: x.split('.')[0], axis=1)

    means = by_condition.mean()
    means.columns = [f'{c} (mean)' for c in means.columns]

    counts = by_condition.count()
    counts.columns = [f'{c} (count)' for c in counts.columns]

    stdevs = by_condition.std(ddof=0)
    stdevs.columns = [f'{c} (stdev)' for c in stdevs.columns]

    both = both.join(means)
    both = both.join(counts)
    both = both.join(stdevs)

    both['symbol'] = both.symbol_6plex.fillna(both.symbol_10plex)
    both['description'] = both.description_6plex.fillna(
        both.description_10plex)

    both = both[both['ABD957 t1 (count)'].ge(4)]
    both = both[both['DMSO t1 (count)'].ge(4)]
    both = both[both['ABD957 t0 (count)'].ge(2)]
    both = both[both['DMSO t0 (count)'].ge(2)]

    # Build the mask from the stdev columns already joined into ``both`` so
    # it is aligned with the rows that survived the count filters, instead
    # of relying on implicit reindexing of a mask from the unfiltered frame.
    too_variable = both[list(stdevs.columns)].ge(100).any(axis=1)
    both = both[~too_variable]

    both = both[[
        'symbol',
        'description',
        'DMSO t0 (mean)',
        'Palm M t0 (mean)',
        'ABD957 t0 (mean)',
        'DMSO t1 (mean)',
        'ABD957 t1 (mean)',
        'Palm M t1 (mean)',
        # 'ABD957 t0 (count)',
        # 'ABD957 t1 (count)',
        # 'DMSO t0 (count)',
        # 'DMSO t1 (count)',
        # 'Palm M t0 (count)',
        # 'Palm M t1 (count)',
        # 'ABD957 t0 (stdev)',
        # 'ABD957 t1 (stdev)',
        # 'DMSO t0 (stdev)',
        # 'DMSO t1 (stdev)',
        # 'Palm M t0 (stdev)',
        # 'Palm M t1 (stdev)',
        'num_unique_peptides (Replicate 1)_10plex',
        'num_unique_peptides (Replicate 2)_10plex',
        'num_unique_peptides (Replicate 1)_6plex',
        'num_unique_peptides (Replicate 2)_6plex',
        'DMSO t0.percent_of_control_0 (Replicate 1)_10plex',
        'DMSO t0.percent_of_control_1 (Replicate 1)',
        'DMSO t0.percent_of_control_0 (Replicate 2)_10plex',
        'DMSO t0.percent_of_control_1 (Replicate 2)',
        'DMSO t0.percent_of_control_0 (Replicate 1)_6plex',
        'DMSO t0.percent_of_control_0 (Replicate 2)_6plex',
        'Palm M t0.percent_of_control_0 (Replicate 1)',
        'Palm M t0.percent_of_control_0 (Replicate 2)',
        'ABD957 t0.percent_of_control_0 (Replicate 1)_10plex',
        'ABD957 t0.percent_of_control_1 (Replicate 1)',
        'ABD957 t0.percent_of_control_0 (Replicate 2)_10plex',
        'ABD957 t0.percent_of_control_1 (Replicate 2)',
        'ABD957 t0.percent_of_control_0 (Replicate 1)_6plex',
        'ABD957 t0.percent_of_control_0 (Replicate 2)_6plex',
        'DMSO t1.percent_of_control_0 (Replicate 1)_10plex',
        'DMSO t1.percent_of_control_1 (Replicate 1)',
        'DMSO t1.percent_of_control_2 (Replicate 1)',
        'DMSO t1.percent_of_control_0 (Replicate 2)_10plex',
        'DMSO t1.percent_of_control_1 (Replicate 2)',
        'DMSO t1.percent_of_control_2 (Replicate 2)',
        'DMSO t1.percent_of_control_0 (Replicate 1)_6plex',
        'DMSO t1.percent_of_control_0 (Replicate 2)_6plex',
        'Palm M t1.percent_of_control_0 (Replicate 1)',
        'Palm M t1.percent_of_control_0 (Replicate 2)',
        'ABD957 t1.percent_of_control_0 (Replicate 1)_10plex',
        'ABD957 t1.percent_of_control_1 (Replicate 1)',
        'ABD957 t1.percent_of_control_2 (Replicate 1)',
        'ABD957 t1.percent_of_control_0 (Replicate 2)_10plex',
        'ABD957 t1.percent_of_control_1 (Replicate 2)',
        'ABD957 t1.percent_of_control_2 (Replicate 2)',
        'ABD957 t1.percent_of_control_0 (Replicate 1)_6plex',
        'ABD957 t1.percent_of_control_0 (Replicate 2)_6plex',
    ]]

    both_output_path = utils.get_timestamped_report_path(
        'nb4_6plex_10plex_{}.xlsx', DATA_OUTPUT_PATH)
    both.to_excel(both_output_path)
def filter_hydroxylamine_data(params_filename: str,
                              output_name: str,
                              output_psm_level=False,
                              output_to_excel=True,
                              min_num_unique_sequences=1,
                              min_num_datasets=1):
    """Build the filtered protein-level hydroxylamine table.

    The analysis described by ``params_filename`` is run, converted to a
    PSM-level frame for its first dataset, stripped of filtered-out rows
    and ``contaminant_`` entries, and aggregated per protein through
    ``group_by_protein_and_filter``.  Per-channel means across replicates
    and a ``mean_reduction`` column (100 minus the mean of the
    'Hydroxylamine' channel means) are added, and the columns are ordered
    for the supplementary table.

    Args:
        params_filename: Parameter file passed to ``analyze``.
        output_name: Text inserted into the report file names.
        output_psm_level: If true, also write the unfiltered PSM-level
            report via ``output_psm_level_report``.
        output_to_excel: If true, write the resulting table to a
            timestamped ``filtered_hydroxylamine_<output_name>_*.xlsx``.
        min_num_unique_sequences: Forwarded to
            ``group_by_protein_and_filter``.
        min_num_datasets: Forwarded to ``group_by_protein_and_filter``.

    Returns:
        The protein-level frame, with its index named ``uniprot``.
    """
    analysis, _ = analyze(params_filename, user='******')
    dataset = analysis.datasets[0]

    df = analysis_to_df(
        analysis, dataset.channel_layout, dataset.control_channels)

    if output_psm_level:
        output_psm_level_report(df, f'unfiltered_{output_name}_psm_level')

    df = df[~df.index.isin(analysis.filtered_out)]
    df = df[~df.uniprot.str.startswith('contaminant_')]

    # processing for filtered protein level table and plotting
    def agg(x):
        """Mean of each percent-of-control column plus unique peptide count."""
        result = x.filter(regex='percent_of_control').mean()
        result['num_unique_peptides'] = x['clean_sequence'].nunique()
        return result

    result = group_by_protein_and_filter(
        df, agg, min_num_unique_sequences, min_num_datasets)

    result.columns = pd.MultiIndex.from_tuples(
        (i.replace('.percent_of_control_', ' '), j)
        for i, j in result.columns)

    # Relabel the second column level as 'Replicate 1..n'.  The returned
    # index is assigned back because ``set_levels(inplace=True)`` is no
    # longer available in current pandas.
    replicate_labels = [
        f'Replicate {i}' for i in range(1, len(dataset.experiments) + 1)
    ]
    result.columns = result.columns.set_levels(replicate_labels, level=1)

    means = result.drop(columns=['num_unique_peptides']).groupby(
        level=0, axis=1).mean()

    result = add_meta(result, meta=df[['uniprot', 'symbol', 'description']])
    result = pd.merge(result, means, left_index=True, right_index=True)
    result['mean_reduction'] = 100 - means.filter(
        regex='Hydroxylamine').mean(axis=1)

    # ordering for supp table
    cols = list(result.columns)
    first_cols = [
        'symbol', 'description', 'mean_reduction', 'PBS 0', 'PBS 1', 'PBS 2',
        'Hydroxylamine 0', 'Hydroxylamine 1', 'Hydroxylamine 2'
    ]
    replicate_cols = [
        c for c in cols
        if c not in first_cols and c[0] != 'num_unique_peptides'
    ]
    num_peptide_cols = [
        c for c in cols
        if c not in first_cols and c[0] == 'num_unique_peptides'
    ]

    result = result[first_cols + num_peptide_cols + replicate_cols]

    # Flatten the remaining (name, replicate) column labels to
    # 'name (replicate)' strings.
    result = result.rename(
        columns={c: '{} ({})'.format(*c) for c in replicate_cols})
    result = result.rename(
        columns={c: '{} ({})'.format(*c) for c in num_peptide_cols})

    result.index.rename('uniprot', inplace=True)

    if output_to_excel:
        result.to_excel(
            utils.get_timestamped_report_path(
                f'filtered_hydroxylamine_{output_name}_{{}}.xlsx',
                DATA_OUTPUT_PATH),
            freeze_panes=(1, 1),
            # index=False
        )

    return result
def filter_timecourse_data(
        output_name,
        params_filename,
        uniprots_passing_ha_filter,
        min_num_unique_sequences=1,
        min_num_datasets=1,
        output_psm_level=False,
        output_to_excel=True,
):
    """Build the filtered protein-level timecourse table.

    The analysis described by ``params_filename`` is run and converted to a
    PSM-level frame for its first dataset.  Filtered-out rows,
    ``contaminant_`` entries and proteins not in
    ``uniprots_passing_ha_filter`` are removed, the rest is aggregated per
    protein through ``group_by_protein_and_filter``, and per-condition
    means are added before the columns are ordered for the supplementary
    table.

    Args:
        output_name: Text inserted into the report file names.
        params_filename: Parameter file passed to ``analyze``.
        uniprots_passing_ha_filter: UniProt ids to keep.
        min_num_unique_sequences: Forwarded to
            ``group_by_protein_and_filter``.
        min_num_datasets: Forwarded to ``group_by_protein_and_filter``.
        output_psm_level: If true, also write the unfiltered PSM-level
            report via ``output_psm_level_report``.
        output_to_excel: If true, write the resulting table to a
            timestamped ``filtered_timecourse_<output_name>_*.xlsx``.

    Returns:
        The protein-level frame, with its index named ``uniprot``.
    """
    analysis, _ = analyze(params_filename, user='******')
    dataset = analysis.datasets[0]

    df = analysis_to_df(
        analysis, dataset.channel_layout, dataset.control_channels)

    if output_psm_level:
        output_psm_level_report(
            df, f'unfiltered_timecourse_{output_name}_psm_level')

    df = df[~df.index.isin(analysis.filtered_out)]
    df = df[~df.uniprot.str.startswith('contaminant_')]
    df = df[df.uniprot.isin(uniprots_passing_ha_filter)]

    def agg(x):
        """Aggregate one group, dropping the most extreme rows first.

        For groups of more than two rows, every percent-of-control column
        whose ``stats.variation`` is at least 0.5 nominates the row with
        the largest absolute ``stats.zscore`` in that column; nominated
        rows are removed before the column means are taken.
        """
        percentages = x.filter(regex='percent_of_control')
        cv = percentages.apply(stats.variation)
        have_cv_ge = percentages[cv[cv.ge(0.5)].index.values]

        if len(x) > 2 and not have_cv_ge.empty:
            to_filter_out = have_cv_ge.apply(
                stats.zscore).abs().idxmax().values
            x = x[~x.index.isin(to_filter_out)]

        result = x.filter(regex='percent_of_control').mean()
        result['num_unique_peptides'] = x['clean_sequence'].nunique()
        return result

    # group by protein
    result = group_by_protein_and_filter(
        df, agg, min_num_unique_sequences, min_num_datasets)

    # result.columns = pd.MultiIndex.from_tuples(
    #     (i.split('.percent')[0], j) for i, j in result.columns
    # )

    # Relabel the second column level as 'Replicate 1..n'.  The returned
    # index is assigned back because ``set_levels(inplace=True)`` is no
    # longer available in current pandas.
    replicate_labels = [
        f'Replicate {i}' for i in range(1, len(dataset.experiments) + 1)
    ]
    result.columns = result.columns.set_levels(replicate_labels, level=1)

    # Average over replicates first, then over the channels that share the
    # same text before '.percent' (the condition label).
    means = result.drop(columns=['num_unique_peptides']).groupby(
        level=0, axis=1).mean()
    means = means.groupby(
        by=lambda x: x.split('.percent')[0], axis='columns').mean()

    result = add_meta(result, meta=df[['uniprot', 'symbol', 'description']])
    result = pd.merge(result, means, left_index=True, right_index=True)

    # ordering for supp table
    cols = list(result.columns)
    unique_conditions = list(
        SortedSet([next(iter(x.values())) for x in dataset.channel_layout]))
    first_cols = ['symbol', 'description', *unique_conditions]
    replicate_cols = [
        c for c in cols
        if c not in first_cols and c[0] != 'num_unique_peptides'
    ]
    num_peptide_cols = [
        c for c in cols
        if c not in first_cols and c[0] == 'num_unique_peptides'
    ]

    result = result[first_cols + num_peptide_cols + replicate_cols]

    result = result.rename(
        columns={x: f'{x} (mean)' for x in unique_conditions})
    # Flatten the remaining (name, replicate) column labels to
    # 'name (replicate)' strings.
    result = result.rename(
        columns={c: '{} ({})'.format(*c) for c in replicate_cols})
    result = result.rename(
        columns={c: '{} ({})'.format(*c) for c in num_peptide_cols})

    result.index.rename('uniprot', inplace=True)

    # Honour the flag; previously the workbook was written regardless of
    # ``output_to_excel``.
    if output_to_excel:
        result.to_excel(
            utils.get_timestamped_report_path(
                f'filtered_timecourse_{output_name}_{{}}.xlsx',
                DATA_OUTPUT_PATH),
            freeze_panes=(1, 1),
            # index=False
        )

    return result
def filter_report(self):
    """Write a per-measurement report of which filters rejected which rows.

    All measurements of the included experiments are loaded, given ids via
    ``self.dataset_class.generate_id`` and labelled with their dataset name
    (``condition``).  For every entry in ``self._analysis.filters`` a column
    named ``<category>.<filter name>`` is set to ``False`` on the rows that
    filter removed.  The table is written, indexed by
    ``(_id, experiment, condition)``, to a timestamped
    ``filter_report_<name>_<analysis id>_*.xlsx`` under ``self.output_path``.

    Returns:
        The report frame indexed by ``id`` (i.e. before the re-indexing
        that is applied only to the written copy).
    """
    m = self.data_model
    included_ids = self.experiment_ids_included

    selection = m.select(
        m.id,
        m.experiment,
        m.uniprot,
        m.symbol,
        m.description,
        m.sequence,
        m.clean_sequence,
        m.ratio,
        m.num_ms2,
        m.rsquared,
        m.charge,
        m.meta
    )
    query = selection.where(m.experiment_id.in_(included_ids))

    df = pd.DataFrame.from_records(list(query.dicts()))
    df = self.dataset_class.generate_id(df)

    for dataset in self.datasets:
        belongs = df.experiment.isin(dataset.experiment_ids_included)
        df.loc[belongs, 'condition'] = dataset.name

    report_cols = [
        '_id',
        'experiment',
        'condition',
        'uniprot',
        'symbol',
        'sequence',
        'meta',
        'num_ms2',
        'rsquared',
        'ratio',
    ]
    df = df.set_index('id')[report_cols]

    # One boolean column per (category, filter); only rejected rows are
    # marked, everything else stays missing.
    for filter_group in self._analysis.filters.values():
        for cat, filters_in_cat in filter_group.items():
            for filter_name, filtered_ids in filters_in_cat.items():
                df.loc[filtered_ids, f'{cat}.{filter_name}'] = False

    # this should probably be just done in SQL
    # experiment_ids = df.experiment.unique().tolist()
    # q2 = Experiment.select(Experiment.id, Experiment.source_url).where(Experiment.id.in_(experiment_ids))
    # experiments = dict(list(q2.tuples()))
    # df.link = df.apply(lambda x: experiments[x.experiment] + x.link.split('"')[1], axis=1)

    report_output_name_template = (
        f'filter_report_{self.name}_{self._analysis.id}_{{}}.xlsx')
    report_output_path = utils.get_timestamped_report_path(
        report_output_name_template,
        self.output_path
    )

    # df.set_index(['seq_id', 'condition', 'experiment']).sort_index(level=0).to_excel(report_output_path)
    written = df.set_index(['_id', 'experiment', 'condition'])
    written.sort_index(level=0).to_excel(report_output_path)

    return df
def sh_analysis(params_file: str, report_output_prefix: str):
    """Summarise per-protein ratios per condition and write them to CSV.

    The analysis described by ``params_file`` is run.  Measurements that
    were not filtered out are restricted to the UniProt ids listed in
    ``input/human.json`` minus a fixed blacklist.  Ratios are reduced to a
    median per (protein, condition, experiment) and then to a median of
    those medians per (protein, condition), together with the summed
    peptide count and the list of per-experiment medians.  All reported
    ratios are inverted (``1 / ratio``).  The table is pivoted to one
    column group per condition, sorted by the first dataset's ratio and
    then by symbol, and missing values are written as ``'-'``.

    Args:
        params_file: Parameter file passed to ``analyze``.
        report_output_prefix: Leading part of the CSV file name, written
            under ``analysis.params.output_folder``.

    Returns:
        Tuple ``(analysis, result_df)``.
    """
    with open('input/human.json') as f:
        whitelist = json.load(f)

    blacklist = [
        'P35030', 'P07477', 'P07478',  # trypsins
        'O15427',  # SLC16A3
        'P00734',  # F2
        'Q14703',  # MBTSP1 (serine protease)
        'Q8NBP7',  # PCSK9
    ]

    analysis, params = analyze(params_file, user='******')

    m = analysis.data_model
    selection = m.select(
        m.id,
        m.experiment,
        m.uniprot,
        m.symbol,
        m.ratio,
    )
    query = selection.where(
        (m.experiment_id.in_(analysis.experiment_ids_included)) &
        (m.id.not_in(analysis.filtered_out))
    )

    df = pd.DataFrame.from_records(list(query.dicts()))
    df = analysis.dataset_class.generate_id(df)

    for dataset in analysis.datasets:
        belongs = df.experiment.isin(dataset.experiment_ids_included)
        df.loc[belongs, 'condition'] = dataset.name

    df = df.set_index('id')
    df = df[df.uniprot.isin(whitelist)]
    df = df[~df.uniprot.isin(blacklist)]

    # First level: one median ratio and a row count per experiment.
    per_experiment = (
        df.groupby(['uniprot', 'symbol', 'condition', 'experiment'])
        .agg(ratio=('ratio', 'median'), num_peptides=('ratio', len))
    )
    # Second level: combine the experiments of each condition.
    per_condition = (
        per_experiment.groupby(level=('uniprot', 'symbol', 'condition'))
        .agg(
            ratio=('ratio', 'median'),
            num_peptides=('num_peptides', 'sum'),
            ratio_list=('ratio', list),
        )
    )

    # result_df['num_peptides'] = result_df.num_peptides.astype(int)
    result_df = per_condition.unstack(level='condition')
    result_df['num_peptides'] = result_df.num_peptides.fillna(0).astype(int)
    # result_df['ratio'] = result_df.ratio.fillna('-')

    def ratios_to_string(ratios, invert=True):
        """Join a list of ratios into a comma-separated string.

        Returns ``None`` for anything that is not a list (e.g. the missing
        values introduced by ``unstack``).
        """
        if not isinstance(ratios, list):
            return None
        values = [1/r for r in ratios] if invert else ratios
        return ', '.join(str(v) for v in values)

    result_df['ratio_list'] = result_df.ratio_list.transform(
        lambda column: column.apply(ratios_to_string))
    result_df['ratio'] = result_df.ratio.rdiv(1)

    # Put the condition on the outer column level and order the columns.
    result_df.columns = result_df.columns.swaplevel()
    sorted_col_multiindex, _ = result_df.columns.sortlevel()
    result_df = result_df[sorted_col_multiindex]

    # result_df.loc[:, pd.IndexSlice[:, 'ratio']] = (result_df
    #     .loc[:, pd.IndexSlice[:, 'ratio']]
    #     .rdiv(1)
    # )

    sort_keys = [(analysis.datasets[0].name, 'ratio'), 'symbol']
    result_df = result_df.reset_index()
    result_df = result_df.sort_values(by=sort_keys, ascending=True)
    result_df = result_df.fillna('-')

    report_output_path = utils.get_timestamped_report_path(
        f'{report_output_prefix}_{{}}.csv',
        pathlib.Path(analysis.params.output_folder),
    )

    result_df.to_csv(report_output_path, index=False, encoding='utf-8-sig')

    return analysis, result_df
'DKO1 ABD957 t0.percent_of_control_0 (Replicate 1)_16plex', 'DKO1 ABD957 t0.percent_of_control_1 (Replicate 1)_16plex', 'Parental ABD957 t0.percent_of_control_0 (Replicate 1)_16plex', 'Parental ABD957 t0.percent_of_control_1 (Replicate 1)_16plex', 'DKO1 DMSO t1.percent_of_control_0 (Replicate 1)_10plex', 'DKO1 DMSO t1.percent_of_control_1 (Replicate 1)_10plex', 'DKO1 DMSO t1.percent_of_control_2 (Replicate 1)_10plex', 'DKO1 DMSO t1.percent_of_control_0 (Replicate 1)_16plex', 'DKO1 DMSO t1.percent_of_control_1 (Replicate 1)_16plex', 'Parental DMSO t1.percent_of_control_0 (Replicate 1)_16plex', 'Parental DMSO t1.percent_of_control_1 (Replicate 1)_16plex', 'DKO1 ABD957 t1.percent_of_control_0 (Replicate 1)_10plex', 'DKO1 ABD957 t1.percent_of_control_1 (Replicate 1)_10plex', 'DKO1 ABD957 t1.percent_of_control_2 (Replicate 1)_10plex', 'DKO1 ABD957 t1.percent_of_control_0 (Replicate 1)_16plex', 'DKO1 ABD957 t1.percent_of_control_1 (Replicate 1)_16plex', 'Parental ABD957 t1.percent_of_control_0 (Replicate 1)_16plex', 'Parental ABD957 t1.percent_of_control_1 (Replicate 1)_16plex', ]] both = both.rename( columns=lambda x: x.replace('Replicate 1)_10plex', 'Experiment 1)_10plex')) both = both.rename( columns=lambda x: x.replace('Replicate 1)_16plex', 'Experiment 2)_16plex')) both_output_path = utils.get_timestamped_report_path( 'dko_16plex_10plex_{}.xlsx', pathlib.Path('output')) both.to_excel(both_output_path) # %%