Python partition_aligned_ms_vectors Exemples

Langage de programmation: Python

Espace de nommage/Pack: metatlas.helpers.spectralprocessing

Méthode/Fonction: partition_aligned_ms_vectors

Exemples au hotexamples.com: 2

Python partition_aligned_ms_vectors - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de metatlas.helpers.spectralprocessing.partition_aligned_ms_vectors extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Exemple #1

0

Afficher le fichier

def make_scores_df(metatlas_dataset): """ Returns pandas dataframe with columns 'max_intensity', 'median_rt_shift','median_mz_ppm', 'max_msms_score', 'num_frag_matches', and 'max_relative_frag_intensity', rows of compounds in metatlas_dataset, and values of the best "score" for a given compound across all files. 'max_intensity': highest intensity across all files for given compound 'median_rt_shift': median shift of RT across all files for given compound to reference 'median_mz_ppm': median ppm of mz across all files for given compound relative to reference 'max_msms_score': highest compound dot-product score across all files for given compound relative to reference 'num_frag_matches': number of matching mzs when calculating max_msms_score 'max_relative_frag_intensity': ratio of second highest to first highest intensity of matching sample mzs :param metatlas_dataset: :return scores_df: pandas dataframe """ file_names = ma_data.get_file_names(metatlas_dataset) compound_names = ma_data.get_compound_names(metatlas_dataset)[0] scores = [] msms_hits_df = dp.get_msms_hits( metatlas_dataset, ref_index=['database', 'id', 'inchi_key', 'precursor_mz']) msms_hits_df.reset_index(inplace=True) for compound_idx in range(len(compound_names)): intensities = [] rt_shifts = [] mz_ppms = [] max_msms_score = np.nan num_frag_matches = np.nan max_relative_frag_intensity = np.nan compound_ref_rt_peak = metatlas_dataset[0][compound_idx][ 'identification'].rt_references[0].rt_peak compound_ref_mz = metatlas_dataset[0][compound_idx][ 'identification'].mz_references[0].mz inchi_key = metatlas_dataset[0][compound_idx][ 'identification'].compound[0].inchi_key if len(msms_hits_df) == 0: comp_msms_hits = msms_hits_df else: comp_msms_hits = msms_hits_df[(msms_hits_df['inchi_key'] == metatlas_dataset[0][compound_idx]['identification'].compound[0].inchi_key) \ & ((abs(msms_hits_df['precursor_mz'].values.astype(float) - metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz)/metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz) \ <= metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz_tolerance*1e-6)] for file_idx in range(len(file_names)): try: assert (metatlas_dataset[file_idx][compound_idx]['data'] ['ms1_summary']['peak_height'] > 0) intensities.append(metatlas_dataset[file_idx][compound_idx] ['data']['ms1_summary']['peak_height']) except: # AssertionError: pass try: assert (metatlas_dataset[file_idx][compound_idx]['data'] ['ms1_summary']['num_ms1_datapoints'] > 0) rt_shifts.append( abs(compound_ref_rt_peak - metatlas_dataset[file_idx] [compound_idx]['data']['ms1_summary']['rt_peak'])) mz_ppms.append( 1e6 * (abs(compound_ref_mz - metatlas_dataset[file_idx] [compound_idx]['data']['ms1_summary']['mz_centroid']) / compound_ref_mz)) except: # AssertionError: pass if len(comp_msms_hits['score']) > 0: row = comp_msms_hits.loc[comp_msms_hits['score'].idxmax()] max_msms_score = row['score'] num_frag_matches = row['num_matches'] if num_frag_matches > 1: msv_sample_matches = sp.partition_aligned_ms_vectors( row['msv_query_aligned'], row['msv_ref_aligned'])[0] msv_sample_matches = msv_sample_matches[:, msv_sample_matches[1]. argsort()[::-1]] msv_sample_matches_by_intensity = msv_sample_matches[:, msv_sample_matches[ 1]. argsort()] max_relative_frag_intensity = msv_sample_matches_by_intensity[ 1, -2] / msv_sample_matches_by_intensity[1, -1] try: max_intensity = np.nanmax(intensities) except ValueError: max_intensity = np.nan try: median_rt_shift = np.nanmedian(rt_shifts) except ValueError: median_rt_shift = np.nan try: median_mz_ppm = np.nanmedian(mz_ppms) except ValueError: median_mz_ppm = np.nan # assign scores scores.append([ metatlas_dataset[0][compound_idx] ['identification'].compound[0].name, metatlas_dataset[0] [compound_idx]['identification'].compound[0].inchi_key, max_intensity, median_rt_shift, median_mz_ppm, max_msms_score, num_frag_matches, max_relative_frag_intensity ]) scores_df = pd.DataFrame(scores, columns=[ 'name', 'inchi_key', 'max_intensity', 'median_rt_shift', 'median_mz_ppm', 'max_msms_score', 'num_frag_matches', 'max_relative_frag_intensity' ]) return scores_df

Exemple #2

0

Afficher le fichier

def make_stats_table( input_fname='', input_dataset=[], msms_hits_df=None, include_lcmsruns=[], exclude_lcmsruns=[], include_groups=[], exclude_groups=[], output_loc=None, msms_hits=None, min_peak_height=0, min_num_data_points=0, rt_tolerance=np.inf, ppm_tolerance=np.inf, min_msms_score=0, min_num_frag_matches=0, allow_no_msms=False, min_relative_frag_intensity=None, use_labels=False, return_all=False, msms_refs_loc='/project/projectdirs/metatlas/projects/spectral_libraries/msms_refs_v2.tab', dependencies={ 'peak_height': [], 'peak_area': ['peak_height'], 'num_data_points': ['peak_height'], 'rt_peak': ['peak_height', 'rt_delta'], 'rt_delta': ['peak_height'], 'mz_centroid': ['peak_height', 'mz_ppm'], 'mz_ppm': ['peak_height'], 'msms_score': ['peak_height', 'num_frag_matches'], 'num_frag_matches': ['peak_height', 'msms_score'] }): assert output_loc is not None or return_all if not input_dataset: metatlas_dataset = ma_data.get_dill_data( os.path.expandvars(input_fname)) else: metatlas_dataset = input_dataset if output_loc is not None and not os.path.exists(output_loc): os.mkdir(output_loc) # filter runs from the metatlas dataset if include_lcmsruns: metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list( metatlas_dataset, 'lcmsrun', include_lcmsruns) if include_groups: metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_include_list( metatlas_dataset, 'group', include_groups) if exclude_lcmsruns: metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list( metatlas_dataset, 'lcmsrun', exclude_lcmsruns) if exclude_groups: metatlas_dataset = dp.filter_lcmsruns_in_dataset_by_exclude_list( metatlas_dataset, 'group', exclude_groups) final_df = pd.DataFrame(columns=['index']) file_names = ma_data.get_file_names(metatlas_dataset) compound_names = ma_data.get_compound_names(metatlas_dataset, use_labels=use_labels)[0] metrics = [ 'msms_score', 'num_frag_matches', 'mz_centroid', 'mz_ppm', 'rt_peak', 'rt_delta', 'peak_height', 'peak_area', 'num_data_points' ] dfs = {m: None for m in metrics} passing = { m: np.ones((len(compound_names), len(file_names))).astype(float) for m in metrics } for metric in ['peak_height', 'peak_area', 'rt_peak', 'mz_centroid']: dfs[metric] = dp.make_output_dataframe(input_dataset=metatlas_dataset, fieldname=metric, use_labels=use_labels) dfs['mz_ppm'] = dfs['peak_height'].copy() dfs['mz_ppm'] *= np.nan dfs['num_data_points'] = pd.DataFrame([[ len(metatlas_dataset[i][j]['data']['eic']['intensity']) for i in range(len(metatlas_dataset)) ] for j in range(len(metatlas_dataset[0]))]) dfs['num_data_points'].index = dfs['mz_ppm'].index dfs['msms_score'] = dfs['mz_ppm'].copy() dfs['num_frag_matches'] = dfs['mz_ppm'].copy() dfs['rt_delta'] = dfs['mz_ppm'].copy() passing['peak_height'] = (np.nan_to_num(dfs['peak_height'].values) >= min_peak_height).astype(float) passing['num_data_points'] = (np.nan_to_num(dfs['num_data_points'].values) >= min_num_data_points).astype(float) #msms_hits_df = dp.get_msms_hits(metatlas_dataset, use_labels, ref_index=['database', 'id', 'inchi_key', 'precursor_mz']) #msms_hits_df = dp.get_msms_hits(metatlas_dataset, use_labels, ref_index=['database', 'id', 'inchi_key']) #msms_hits_df.rename(columns={'inchi_key':'inchi_key_2'},inplace=True) msms_hits_df = msms_hits.copy() msms_hits_df.reset_index(inplace=True) for compound_idx, compound_name in enumerate(compound_names): ref_rt_peak = metatlas_dataset[0][compound_idx][ 'identification'].rt_references[0].rt_peak ref_mz = metatlas_dataset[0][compound_idx][ 'identification'].mz_references[0].mz dfs['rt_delta'].iloc[compound_idx] = abs( ref_rt_peak - dfs['rt_peak'].iloc[compound_idx]) passing['rt_delta'][compound_idx] = ( abs(ref_rt_peak - np.nan_to_num(dfs['rt_peak'].iloc[compound_idx].values)) <= rt_tolerance).astype(float) dfs['mz_ppm'].iloc[compound_idx] = 1e6 * ( abs(ref_mz - dfs['mz_centroid'].iloc[compound_idx]) / ref_mz) passing['mz_ppm'][compound_idx] = ( dfs['mz_ppm'].iloc[compound_idx].values <= ppm_tolerance).astype(float) try: inchi_key = metatlas_dataset[0][compound_idx][ 'identification'].compound[0].inchi_key except: inchi_key = '' compound_ref_rt_min = metatlas_dataset[0][compound_idx][ 'identification'].rt_references[0].rt_min compound_ref_rt_max = metatlas_dataset[0][compound_idx][ 'identification'].rt_references[0].rt_max cid = metatlas_dataset[0][compound_idx]['identification'] mz_theoretical = cid.mz_references[0].mz mz_measured = metatlas_dataset[0][compound_idx]['data']['ms1_summary'][ 'mz_centroid'] delta_mz = abs(mz_theoretical - mz_measured) delta_ppm = delta_mz / mz_theoretical * 1e6 comp_msms_hits = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) \ & (msms_hits_df['msms_scan'] >= compound_ref_rt_min) \ & (msms_hits_df['msms_scan'] <= compound_ref_rt_max) \ & ((abs(msms_hits_df['measured_precursor_mz'].values.astype(float) - mz_theoretical)/mz_theoretical) \ <= cid.mz_references[0].mz_tolerance*1e-6)] comp_msms_hits = comp_msms_hits.sort_values('score', ascending=False) file_idxs, scores, msv_sample_list, msv_ref_list, rt_list = [], [], [], [], [] if len(comp_msms_hits) > 0 and not np.isnan( np.concatenate(comp_msms_hits['msv_ref_aligned'].values, axis=1)).all(): file_idxs = [ file_names.index(f) for f in comp_msms_hits['file_name'] ] scores = comp_msms_hits['score'].values.tolist() msv_sample_list = comp_msms_hits[ 'msv_query_aligned'].values.tolist() msv_ref_list = comp_msms_hits['msv_ref_aligned'].values.tolist() rt_list = comp_msms_hits['msms_scan'].values.tolist() mz_sample_matches = sp.partition_aligned_ms_vectors( msv_sample_list[0], msv_ref_list[0])[0][0].tolist() avg_mz_measured = [] avg_rt_measured = [] intensities = pd.DataFrame() for file_idx, file_name in enumerate(file_names): if not np.isnan(metatlas_dataset[file_idx][compound_idx]['data'] ['ms1_summary']['mz_centroid']): avg_mz_measured.append(metatlas_dataset[file_idx][compound_idx] ['data']['ms1_summary']['mz_centroid']) if not np.isnan(metatlas_dataset[file_idx][compound_idx]['data'] ['ms1_summary']['rt_peak']): avg_rt_measured.append(metatlas_dataset[file_idx][compound_idx] ['data']['ms1_summary']['rt_peak']) if not np.isnan(metatlas_dataset[file_idx][compound_idx]['data'] ['ms1_summary']['peak_height']): intensities.loc[file_idx, 'file_id'] = file_idx intensities.loc[ file_idx, 'intensity'] = metatlas_dataset[file_idx][ compound_idx]['data']['ms1_summary']['peak_height'] avg_mz_measured = np.mean(avg_mz_measured) avg_rt_measured = np.mean(avg_rt_measured) final_df = final_df.append({'index': compound_idx}, ignore_index=True) final_df.loc[compound_idx, 'identified_metabolite'] = "" if use_labels or len(cid.compound) == 0: cid_label = cid.name final_df.loc[compound_idx, 'label'] = cid_label else: cid_label = cid.compound[0].name final_df.loc[compound_idx, 'label'] = cid_label overlapping_compounds = [] inchi_key_map = {} #Loop through compounds to identify overlapping compounds for compound_iterator in range(len(compound_names)): if len(metatlas_dataset[0][compound_iterator] ['identification'].compound) == 0: continue if use_labels: cpd_iter_label = metatlas_dataset[0][compound_iterator][ 'identification'].name else: cpd_iter_label = metatlas_dataset[0][compound_iterator][ 'identification'].compound[0].name cpd_iter_id = metatlas_dataset[0][compound_iterator][ 'identification'] cpd_iter_mz = cpd_iter_id.mz_references[0].mz cid_mass = cid.compound[0].mono_isotopic_molecular_weight cpd_iter_mass = cpd_iter_id.compound[ 0].mono_isotopic_molecular_weight cid_rt_min = cid.rt_references[0].rt_min cid_rt_max = cid.rt_references[0].rt_max cpd_iter_rt_min = cpd_iter_id.rt_references[0].rt_min cpd_iter_rt_max = cpd_iter_id.rt_references[0].rt_max if compound_idx != compound_iterator: if ((cpd_iter_mz-0.005 <= mz_theoretical <= cpd_iter_mz+0.005) or (cpd_iter_mass-0.005 <= cid_mass <= cpd_iter_mass+0.005)) and \ ((cpd_iter_rt_min <= cid_rt_min <=cpd_iter_rt_max) or (cpd_iter_rt_min <= cid_rt_max <= cpd_iter_rt_max) or \ (cid_rt_min <= cpd_iter_rt_min <= cid_rt_max) or (cid_rt_min <= cpd_iter_rt_max <= cid_rt_max)): overlapping_compounds.append(cpd_iter_label) inchi_key_map[cpd_iter_label] = cpd_iter_id.compound[ 0].inchi_key if len(overlapping_compounds) > 0: overlapping_compounds.append(cid_label) inchi_key_map[cid_label] = cid.compound[0].inchi_key final_df.loc[compound_idx, 'overlapping_compound'] = "//".join( cpd for cpd in sorted(overlapping_compounds, key=str)) final_df.loc[compound_idx, 'overlapping_inchi_keys'] = "//".join( inchi_key_map[cpd] for cpd in sorted(overlapping_compounds, key=str)) else: final_df.loc[compound_idx, 'overlapping_compound'] = "" final_df.loc[compound_idx, 'overlapping_inchi_keys'] = "" if len(cid.compound) == 0: final_df.loc[compound_idx, 'formula'] = "" final_df.loc[compound_idx, 'polarity'] = cid.mz_references[0].detected_polarity final_df.loc[compound_idx, 'exact_mass'] = "" final_df.loc[compound_idx, 'inchi_key'] = "" else: final_df.loc[compound_idx, 'formula'] = cid.compound[0].formula final_df.loc[compound_idx, 'polarity'] = cid.mz_references[0].detected_polarity final_df.loc[ compound_idx, 'exact_mass'] = cid.compound[0].mono_isotopic_molecular_weight final_df.loc[compound_idx, 'inchi_key'] = cid.compound[0].inchi_key final_df.loc[compound_idx, 'msms_quality'] = "" final_df.loc[compound_idx, 'mz_quality'] = "" final_df.loc[compound_idx, 'rt_quality'] = "" final_df.loc[compound_idx, 'total_score'] = "" final_df.loc[compound_idx, 'msi_level'] = "" final_df.loc[compound_idx, 'isomer_details'] = "" final_df.loc[compound_idx, 'identification_notes'] = cid.description if len(intensities) > 0: final_df.loc[compound_idx, 'max_intensity'] = intensities.loc[ intensities['intensity'].idxmax()]['intensity'] final_df.loc[compound_idx, 'max_intensity_file'] = file_names[int( intensities.loc[intensities['intensity'].idxmax()]['file_id'])] else: final_df.loc[compound_idx, 'max_intensity'] = "" final_df.loc[compound_idx, 'max_intensity_file'] = "" if file_idxs != []: final_df.loc[compound_idx, 'msms_file'] = file_names[file_idxs[0]] final_df.loc[compound_idx, 'msms_rt'] = float("%.2f" % rt_list[0]) final_df.loc[compound_idx, 'msms_numberofions'] = len(mz_sample_matches) final_df.loc[compound_idx, 'msms_matchingions'] = ','.join( ['%5.3f' % m for m in mz_sample_matches]) if len(mz_sample_matches) == 1: # Set score to zero when there is only one matching ion. precursor intensity is set as score in such cases and need to be set to 0 for final identification. final_df.loc[compound_idx, 'msms_score'] = 0.0 else: final_df.loc[compound_idx, 'msms_score'] = float("%.4f" % scores[0]) else: final_df.loc[compound_idx, 'msms_file'] = "" final_df.loc[compound_idx, 'msms_rt'] = "" final_df.loc[compound_idx, 'msms_numberofions'] = "" final_df.loc[compound_idx, 'msms_matchingions'] = "" final_df.loc[compound_idx, 'msms_score'] = "" final_df.loc[compound_idx, 'mz_adduct'] = cid.mz_references[0].adduct final_df.loc[compound_idx, 'mz_theoretical'] = float("%.4f" % mz_theoretical) final_df.loc[compound_idx, 'mz_measured'] = float("%.4f" % avg_mz_measured) final_df.loc[compound_idx, 'mz_error'] = float( "%.4f" % abs(mz_theoretical - avg_mz_measured)) final_df.loc[compound_idx, 'mz_ppmerror'] = float( "%.4f" % (abs(mz_theoretical - avg_mz_measured) / mz_theoretical * 1e6)) final_df.loc[compound_idx, 'rt_min'] = float("%.2f" % compound_ref_rt_min) final_df.loc[compound_idx, 'rt_max'] = float("%.2f" % compound_ref_rt_max) final_df.loc[compound_idx, 'rt_theoretical'] = float( "%.2f" % cid.rt_references[0].rt_peak) final_df.loc[compound_idx, 'rt_measured'] = float("%.2f" % avg_rt_measured) final_df.loc[compound_idx, 'rt_error'] = float( "%.2f" % abs(cid.rt_references[0].rt_peak - avg_rt_measured)) for file_idx, file_name in enumerate(file_names): rows = msms_hits_df[(msms_hits_df['inchi_key'] == inchi_key) & \ (msms_hits_df['file_name'] == file_name) & \ (msms_hits_df['msms_scan'] >= compound_ref_rt_min) & (msms_hits_df['msms_scan'] <= compound_ref_rt_max) & \ ((abs(msms_hits_df['measured_precursor_mz'].values.astype(float) - metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz)/metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz) \ <= metatlas_dataset[0][compound_idx]['identification'].mz_references[0].mz_tolerance*1e-6)] if len(rows) == 0: dfs['msms_score'].iat[compound_idx, file_idx] = np.nan dfs['num_frag_matches'].iat[compound_idx, file_idx] = np.nan else: if not np.isnan( np.concatenate(rows['msv_ref_aligned'].values, axis=1)).all(): dfs['msms_score'].iat[compound_idx, file_idx] = rows.loc[ rows['score'].idxmax()]['score'] dfs['num_frag_matches'].iat[compound_idx, file_idx] = rows.loc[ rows['score'].idxmax()]['num_matches'] passing['msms_score'] = (np.nan_to_num(dfs['msms_score'].values) >= min_msms_score).astype(float) passing['num_frag_matches'] = (np.nan_to_num( dfs['num_frag_matches'].values) >= min_num_frag_matches).astype(float) writer = pd.ExcelWriter(os.path.join(output_loc, 'Draft_Final_Idenfications.xlsx'), engine='xlsxwriter') final_df.to_excel(writer, sheet_name='Final_Identifications', index=False, startrow=3) #set format workbook = writer.book f_blue = workbook.add_format({'bg_color': '#DCFFFF'}) f_yellow = workbook.add_format({'bg_color': '#FFFFDC'}) f_rose = workbook.add_format({'bg_color': '#FFDCFF'}) cell_format = workbook.add_format({'bold': True, 'align': 'center'}) scientific_format = workbook.add_format({'num_format': '0.00E+00'}) cell_format.set_text_wrap() cell_format.set_border() worksheet = writer.sheets['Final_Identifications'] worksheet.set_row(1, 60) worksheet.set_row(2, 60) worksheet.set_column('Q:Q', None, scientific_format) worksheet.merge_range('A1:I1', 'COMPOUND ANNOTATION', cell_format) worksheet.merge_range('J1:P1', 'COMPOUND IDENTIFICATION SCORES', cell_format) worksheet.merge_range('Q1:R1', 'MS1 INTENSITY INFORMATION', cell_format) worksheet.merge_range('S1:V1', 'MSMS INFORMATION', cell_format) worksheet.write('W1', 'MSMS EVALUATION', cell_format) worksheet.merge_range('X1:Z1', 'ION INFORMATION', cell_format) worksheet.merge_range('AA1:AB1', 'M/Z EVALUATION', cell_format) worksheet.merge_range('AC1:AF1', 'CHROMATOGRAPHIC PEAK INFORMATION', cell_format) worksheet.write('AG1', 'RT EVALUATION', cell_format) HEADER2 = [ 'Compound #', 'Identified Metabolite', 'Name of metabolite searched for', 'Labels of Overlapping Compounds', 'Inchi Keys of Overlapping Compounds', 'Molecular Formula', 'Polarity', 'Exact Mass', 'Inchi Key', 'MSMS Score (0 to 1)', 'm/z score (0 to 1)', 'RT score (0 to 1)', 'Total ID Score (0 to 3)', 'Mass Spec Inititative Identification Level', 'Isomer details', 'Identification notes', 'Maximum MS1 intensity across all files', 'Maximum MS1 intensity across all files', 'File with highest MSMS match score', 'RT of highest matched MSMS scan', 'Number of ion matches in msms spectra to EMA reference spectra', 'List of ion matches in msms spectra to EMA reference spectra', '', 'Adduct', 'Theoretical m/z', 'Measured m/z', 'mass error (delta Da)', 'mass error (delta ppm)', 'Minimum retention time (min.)', 'Maximum retention time (min.)', 'Theoretical retention time (min.)', 'Detected + averaged RT (min.)', 'RT error (absolute delta min.)' ] for i, header in enumerate(HEADER2): worksheet.write(1, i, header, cell_format) HEADER3 = [ 'Unique for study', 'Some isomers are not chromatographically or spectrally resolvable.', 'Name of standard reference compound in library match.', 'compound with similar mz (abs difference <= 0.005) or monoisotopic molecular weight (abs difference <= 0.005) and RT (min or max within the RT-min-max-range of similar compound)', 'List of inchi keys that correspond to the compounds listed in the previous column', '', '', 'monoisotopic mass (neutral except for permanently charged molecules)', 'neutralized version', '1 (MSMS matches ref. std.), 0.5 (possible match), 0 (no MSMS collected or no appropriate ref available), -1 (bad match)', '1 (delta ppm </= 5 or delta Da </= 0.001), 0.5 (delta ppm 5-10 and delta Da > 0.001), 0 (delta ppm > 10) 1 (delta ppm </= 15 or delta Da </= 0.005), 0.5 (delta ppm 15-20 and delta Da > 0.001), 0 (delta ppm > 20) (NOTE: neg mass accuracy is not as good as pos)', '1 (delta RT </= 0.5), 0.5 (delta RT > 0.5 & </= 2), 0 (delta RT > 2 min)', 'sum of m/z, RT and MSMS score', 'Level 1 = Two independent and orthogonal properties match authentic standard; else = putative [Metabolomics. 2007 Sep; 3(3): 211-221. doi: 10.1007/s11306-007-0082-2]', 'Isomers have same formula (and m/z) and similar RT - MSMS spectra may be used to differentiate (exceptions) or RT elution order', '', '', '', '', '', 'mean # of fragment ions matching between compound in sample and reference compound / standard; may include parent and isotope ions and very low intensity background ions (these do not contribute to score)', '', 'MSMS score (highest across all samples), scale of 0 to 1 based on an algorithm. 0 = no match, 1 = perfect match. If no score, then no MSMS was acquired for that compound (@ m/z & RT window).', 'More than one may be detectable; the one evaluated is listed', 'theoretical m/z for a given compound / adduct pair', 'average m/z within 20ppm of theoretical detected across all samples @ RT peak', 'absolute difference between theoretical and detected m/z', 'ppm difference between theoretical and detected m/z', '', '', 'theoretical retention time for a compound based upon reference standard at highest intensity point of peak', 'average retention time for a detected compound at highest intensity point of peak across all samples', 'absolute difference between theoretical and detected RT peak' ] for i, header in enumerate(HEADER3): worksheet.write(2, i, header, cell_format) worksheet.merge_range( 'AC3:AD3', 'Retention range including start and end of detection of an m/z value (Note: Peak Height is calculated as the highest intensity of an m/z within the min/max RT range. Peak Area is calculated as the integrated area under the curve for an m/z within the mix/max RT range.)', cell_format) worksheet.conditional_format('J1:P' + str(len(final_df) + 4), { 'type': 'no_errors', 'format': f_blue }) worksheet.conditional_format('Q1:V' + str(len(final_df) + 4), { 'type': 'no_errors', 'format': f_yellow }) worksheet.conditional_format('W1:W' + str(len(final_df) + 4), { 'type': 'no_errors', 'format': f_rose }) worksheet.conditional_format('X1:Z' + str(len(final_df) + 4), { 'type': 'no_errors', 'format': f_yellow }) worksheet.conditional_format('AA1:AB' + str(len(final_df) + 4), { 'type': 'no_errors', 'format': f_rose }) worksheet.conditional_format('AC1:AF' + str(len(final_df) + 4), { 'type': 'no_errors', 'format': f_yellow }) worksheet.conditional_format('AG1:AG' + str(len(final_df) + 4), { 'type': 'no_errors', 'format': f_rose }) writer.save() #final_df.to_csv(os.path.join(output_loc, 'Draft_Final_Idenfications.tab'), sep='\t') for metric in metrics: passing[metric][passing[metric] == 0] = np.nan stats_table = [] for metric in metrics: test = np.product(np.array( [passing[dep] for dep in dependencies[metric]]), axis=0) # group_df = (dfs[metric] * test).T.groupby('group').describe() if output_loc is not None: (dfs[metric] * test).to_csv(os.path.join( output_loc, 'filtered_%s.tab' % metric), sep='\t') stats_df = (dfs[metric] * test * passing[metric]).T.describe().T stats_df['range'] = stats_df['max'] - stats_df['min'] stats_df.columns = pd.MultiIndex.from_product([['filtered'], [metric], stats_df.columns]) stats_table.append(stats_df) for metric in metrics: if output_loc is not None: dfs[metric].to_csv(os.path.join(output_loc, 'unfiltered_%s.tab' % metric), sep='\t') stats_df = dfs[metric].T.describe().T stats_df['range'] = stats_df['max'] - stats_df['min'] stats_df.columns = pd.MultiIndex.from_product([['unfiltered'], [metric], stats_df.columns]) stats_table.append(stats_df) stats_table = pd.concat(stats_table, axis=1) if output_loc is not None: stats_table.to_csv(os.path.join(output_loc, 'stats_table.tab'), sep='\t') with open(os.path.join(output_loc, 'stats_table.readme'), 'w') as readme: for var in [ 'dependencies', 'min_peak_height', 'rt_tolerance', 'ppm_tolerance', 'min_msms_score', 'min_num_frag_matches' ]: readme.write('%s\n' % var) try: if np.isinf(eval(var)): pprint.pprint('default', readme) else: pprint.pprint(eval(var), readme) except TypeError: pprint.pprint(eval(var), readme) readme.write('\n') if return_all: return stats_table, dfs, passing