import os
import pprint

import numpy as np
import pandas as pd

# Helper-module aliases used below. These imports follow the historical
# metatlas.helpers layout and are an assumption; adjust the paths if your
# metatlas version organizes these modules differently.
from metatlas.helpers import metatlas_get_data_helper_fun as ma_data
from metatlas.helpers import dill2plots as dp
from metatlas.helpers import spectralprocessing as sp


def make_stats_table(input_fname='', input_dataset=[],
                     include_lcmsruns=[], exclude_lcmsruns=[],
                     include_groups=[], exclude_groups=[],
                     output_loc=None,
                     min_peak_height=0, rt_tolerance=np.inf, ppm_tolerance=np.inf,
                     min_msms_score=0, min_num_frag_matches=0,
                     allow_no_msms=False, min_relative_frag_intensity=None,
                     use_labels=False, return_all=False,
                     msms_refs_loc='/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v2.tab',
                     dependencies={'peak_height': [],
                                   'peak_area': ['peak_height'],
                                   'rt_peak': ['peak_height', 'rt_delta'],
                                   'rt_delta': ['peak_height'],
                                   'mz_centroid': ['peak_height', 'mz_ppm'],
                                   'mz_ppm': ['peak_height'],
                                   'msms_score': ['peak_height', 'num_frag_matches'],
                                   'num_frag_matches': ['peak_height', 'msms_score']}):

    assert output_loc is not None or return_all

    if not input_dataset:
        metatlas_dataset = ma_data.get_dill_data(os.path.expandvars(input_fname))
    else:
        metatlas_dataset = input_dataset

    if output_loc is not None and not os.path.exists(output_loc):
        os.mkdir(output_loc)

    # filter runs from the metatlas dataset
    if include_lcmsruns:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list(metatlas_dataset, 'lcmsrun', include_lcmsruns)
    if include_groups:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list(metatlas_dataset, 'group', include_groups)
    if exclude_lcmsruns:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list(metatlas_dataset, 'lcmsrun', exclude_lcmsruns)
    if exclude_groups:
        metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list(metatlas_dataset, 'group', exclude_groups)

    file_names = ma_data.get_file_names(metatlas_dataset)
    compound_names = ma_data.get_compound_names(metatlas_dataset, use_labels=use_labels)[0]

    metrics = ['msms_score', 'num_frag_matches', 'mz_centroid', 'mz_ppm',
               'rt_peak', 'rt_delta', 'peak_height', 'peak_area']

    # one compounds-by-files dataframe per metric, plus a parallel "passing"
    # mask (1.0 = passes its threshold, NaN = fails) used to blank out
    # dependent metrics
    dfs = {m: None for m in metrics}
    passing = {m: np.ones((len(compound_names), len(file_names))).astype(float) for m in metrics}

    for metric in ['peak_height', 'peak_area', 'rt_peak', 'mz_centroid']:
        dfs[metric] = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname=metric, use_labels=use_labels)

    dfs['mz_ppm'] = dfs['peak_height'].copy()
    dfs['mz_ppm'] *= np.nan
    dfs['msms_score'] = dfs['mz_ppm'].copy()
    dfs['num_frag_matches'] = dfs['mz_ppm'].copy()
    dfs['rt_delta'] = dfs['mz_ppm'].copy()

    passing['peak_height'] = (np.nan_to_num(dfs['peak_height'].values) >= min_peak_height).astype(float)

    msms_hits_df = dp.get_msms_hits(metatlas_dataset, use_labels,
                                    ref_index=['database', 'id', 'inchi_key', 'precursor_mz'])
    msms_hits_df.reset_index(inplace=True)

    for compound_idx, compound_name in enumerate(compound_names):
        ref_rt_peak = metatlas_dataset[0][compound_idx]['identification'].rt_references[0].rt_peak
        ref_mz = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz
        ref_mz_tolerance = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz_tolerance
        inchi_key = metatlas_dataset[0][compound_idx]['identification'].compound[0].inchi_key

        dfs['rt_delta'].iloc[compound_idx] = abs(ref_rt_peak - dfs['rt_peak'].iloc[compound_idx])
        passing['rt_delta'][compound_idx] = (abs(ref_rt_peak - np.nan_to_num(dfs['rt_peak'].iloc[compound_idx].values))
                                             <= rt_tolerance).astype(float)

        dfs['mz_ppm'].iloc[compound_idx] = 1e6 * (abs(ref_mz - dfs['mz_centroid'].iloc[compound_idx]) / ref_mz)
        passing['mz_ppm'][compound_idx] = (dfs['mz_ppm'].iloc[compound_idx].values <= ppm_tolerance).astype(float)

        for file_idx, file_name in enumerate(file_names):
            # MSMS hits for this compound and file whose precursor m/z is within
            # the reference's ppm tolerance
            rows = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) &
                                (msms_hits_df['file_name'] == file_name) &
                                (abs(msms_hits_df['precursor_mz'].values.astype(float) - ref_mz) / ref_mz
                                 <= ref_mz_tolerance * 1e-6)]

            if len(rows) == 0:
                dfs['msms_score'].iat[compound_idx, file_idx] = np.nan
                dfs['num_frag_matches'].iat[compound_idx, file_idx] = np.nan
            else:
                best_hit = rows.loc[rows['score'].idxmax()]
                dfs['msms_score'].iat[compound_idx, file_idx] = best_hit['score']
                dfs['num_frag_matches'].iat[compound_idx, file_idx] = best_hit['num_matches']

    passing['msms_score'] = (np.nan_to_num(dfs['msms_score'].values) >= min_msms_score).astype(float)
    passing['num_frag_matches'] = (np.nan_to_num(dfs['num_frag_matches'].values) >= min_num_frag_matches).astype(float)

    # convert failing cells to NaN so that multiplying by the mask blanks them out
    for metric in metrics:
        passing[metric][passing[metric] == 0] = np.nan

    stats_table = []

    for metric in metrics:
        test = np.prod(np.array([passing[dep] for dep in dependencies[metric]]), axis=0)
        # group_df = (dfs[metric] * test).T.groupby('group').describe()
        if output_loc is not None:
            (dfs[metric] * test).to_csv(os.path.join(output_loc, 'filtered_%s.tab' % metric), sep='\t')
        stats_df = (dfs[metric] * test * passing[metric]).T.describe().T
        stats_df['range'] = stats_df['max'] - stats_df['min']
        stats_df.columns = pd.MultiIndex.from_product([['filtered'], [metric], stats_df.columns])
        stats_table.append(stats_df)

    for metric in metrics:
        if output_loc is not None:
            dfs[metric].to_csv(os.path.join(output_loc, 'unfiltered_%s.tab' % metric), sep='\t')
        stats_df = dfs[metric].T.describe().T
        stats_df['range'] = stats_df['max'] - stats_df['min']
        stats_df.columns = pd.MultiIndex.from_product([['unfiltered'], [metric], stats_df.columns])
        stats_table.append(stats_df)

    stats_table = pd.concat(stats_table, axis=1)

    if output_loc is not None:
        stats_table.to_csv(os.path.join(output_loc, 'stats_table.tab'), sep='\t')

        # record the thresholds and dependency graph used to generate the table
        with open(os.path.join(output_loc, 'stats_table.readme'), 'w') as readme:
            for var in ['dependencies', 'min_peak_height', 'rt_tolerance', 'ppm_tolerance',
                        'min_msms_score', 'min_num_frag_matches']:
                readme.write('%s\n' % var)
                try:
                    if np.isinf(eval(var)):
                        pprint.pprint('default', readme)
                    else:
                        pprint.pprint(eval(var), readme)
                except TypeError:
                    pprint.pprint(eval(var), readme)
                readme.write('\n')

    if return_all:
        return stats_table, dfs, passing

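# Example usage (a minimal sketch): the pickle path, output directory, and
# threshold values below are hypothetical placeholders, not defaults from
# this module.
#
#     stats_table, dfs, passing = make_stats_table(
#         input_fname='/path/to/saved_metatlas_dataset.pkl',
#         output_loc='/path/to/stats_output',
#         min_peak_height=1e5,
#         rt_tolerance=0.5,
#         ppm_tolerance=20,
#         min_msms_score=0.6,
#         min_num_frag_matches=3,
#         return_all=True)

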
def make_scores_df(metatlas_dataset):
    """
    Returns pandas dataframe with columns 'max_intensity', 'median_rt_shift', 'median_mz_ppm',
    'max_msms_score', 'num_frag_matches', and 'max_relative_frag_intensity',
    rows of compounds in metatlas_dataset, and values of the best "score" for a
    given compound across all files.

    'max_intensity': highest intensity across all files for given compound
    'median_rt_shift': median shift of RT across all files for given compound to reference
    'median_mz_ppm': median ppm of mz across all files for given compound relative to reference
    'max_msms_score': highest compound dot-product score across all files for given compound relative to reference
    'num_frag_matches': number of matching mzs when calculating max_msms_score
    'max_relative_frag_intensity': ratio of second highest to first highest intensity of matching sample mzs

    :param metatlas_dataset:
    :return scores_df: pandas dataframe
    """

    file_names = ma_data.get_file_names(metatlas_dataset)
    compound_names = ma_data.get_compound_names(metatlas_dataset)[0]

    scores = []

    msms_hits_df = dp.get_msms_hits(metatlas_dataset, ref_index=['database', 'id', 'inchi_key', 'precursor_mz'])
    msms_hits_df.reset_index(inplace=True)

    for compound_idx in range(len(compound_names)):
        intensities = []
        rt_shifts = []
        mz_ppms = []
        max_msms_score = np.nan
        num_frag_matches = np.nan
        max_relative_frag_intensity = np.nan

        compound_ref_rt_peak = metatlas_dataset[0][compound_idx]['identification'].rt_references[0].rt_peak
        compound_ref_mz = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz
        compound_ref_mz_tolerance = metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz_tolerance
        inchi_key = metatlas_dataset[0][compound_idx]['identification'].compound[0].inchi_key

        if len(msms_hits_df) == 0:
            comp_msms_hits = msms_hits_df
        else:
            comp_msms_hits = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) &
                                          (abs(msms_hits_df['precursor_mz'].values.astype(float) - compound_ref_mz) / compound_ref_mz
                                           <= compound_ref_mz_tolerance * 1e-6)]

        for file_idx in range(len(file_names)):
            try:
                assert metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['peak_height'] > 0
                intensities.append(metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['peak_height'])
            except:  # AssertionError:
                pass

            try:
                assert metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['num_ms1_datapoints'] > 0
                rt_shifts.append(abs(compound_ref_rt_peak -
                                     metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['rt_peak']))
                mz_ppms.append(1e6 * (abs(compound_ref_mz -
                                          metatlas_dataset[file_idx][compound_idx]['data']['ms1_summary']['mz_centroid'])
                                      / compound_ref_mz))
            except:  # AssertionError:
                pass

        if len(comp_msms_hits['score']) > 0:
            row = comp_msms_hits.loc[comp_msms_hits['score'].idxmax()]
            max_msms_score = row['score']
            num_frag_matches = row['num_matches']

            if num_frag_matches > 1:
                msv_sample_matches = sp.partition_aligned_ms_vectors(row['msv_query_aligned'],
                                                                     row['msv_ref_aligned'])[0]
                msv_sample_matches = msv_sample_matches[:, msv_sample_matches[1].argsort()[::-1]]
                msv_sample_matches_by_intensity = msv_sample_matches[:, msv_sample_matches[1].argsort()]
                max_relative_frag_intensity = (msv_sample_matches_by_intensity[1, -2]
                                               / msv_sample_matches_by_intensity[1, -1])

        try:
            max_intensity = np.nanmax(intensities)
        except ValueError:
            max_intensity = np.nan
        try:
            median_rt_shift = np.nanmedian(rt_shifts)
        except ValueError:
            median_rt_shift = np.nan
        try:
            median_mz_ppm = np.nanmedian(mz_ppms)
        except ValueError:
            median_mz_ppm = np.nan

        # assign scores
        scores.append([metatlas_dataset[0][compound_idx]['identification'].compound[0].name,
                       metatlas_dataset[0][compound_idx]['identification'].compound[0].inchi_key,
                       max_intensity,
                       median_rt_shift,
                       median_mz_ppm,
                       max_msms_score,
                       num_frag_matches,
                       max_relative_frag_intensity])

    scores_df = pd.DataFrame(scores,
                             columns=['name', 'inchi_key',
                                      'max_intensity', 'median_rt_shift', 'median_mz_ppm',
                                      'max_msms_score', 'num_frag_matches',
                                      'max_relative_frag_intensity'])

    return scores_df

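# Example usage (a minimal sketch): the pickle path and the cutoff values used
# to select compounds are hypothetical placeholders.
#
#     metatlas_dataset = ma_data.get_dill_data('/path/to/saved_metatlas_dataset.pkl')
#     scores_df = make_scores_df(metatlas_dataset)
#     # keep compounds with a strong MS/MS match in at least one file
#     confident = scores_df[(scores_df['max_msms_score'] >= 0.6) &
#                           (scores_df['num_frag_matches'] >= 3)]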