def save_as_mgf(spectrums: List[Spectrum], filename: str):
    """Save spectrum(s) as mgf file.

    :py:attr:`~matchms.Spectrum.losses` of spectrum will not be saved.

    Arguments:
    ----------
    spectrums:
        Expected input are match.Spectrum.Spectrum() objects. Anything that
        is not a ``list`` is treated as a single spectrum.
    filename:
        Provide filename to save spectrum(s). Spectra are appended to it.
    """
    if not isinstance(spectrums, list):
        # Assume that input was single Spectrum
        spectrums = [spectrums]

    if not spectrums:
        # Nothing to write: leave the file system untouched.
        return

    # Convert matchms.Spectrum() into dictionaries for pyteomics. The
    # generator is consumed by a single write call, so the output file is
    # opened once instead of once per spectrum.
    spectrum_dicts = (
        {
            "m/z array": spectrum.peaks.mz,
            "intensity array": spectrum.peaks.intensities,
            "params": spectrum.metadata,
        }
        for spectrum in spectrums
    )

    # Append spectra to file
    py_mgf.write(spectrum_dicts, filename)
def format_mgf_deepnovo(mgf_input: str, mgf_output: str):
    """Format MGF file for use with DeepNovoV2.

    Only the parameters listed in ``key_order`` are kept, and they are
    written in that order. Spectra without any m/z value are skipped.

    Args:
        mgf_input (str): path to the input MGF file.
        mgf_output (str): path to the output MGF file.
    """
    key_order = ['title', 'pepmass', 'charge', 'scans', 'rtinseconds']

    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            # Skip empty spectra.
            if not spectrum['m/z array'].size:
                continue

            # Drop every parameter that is not part of the expected set.
            params = spectrum['params']
            for name in [key for key in params if key not in key_order]:
                params.pop(name)

            # Append the spectrum to the output with the parameters in the
            # required order.
            mgf.write((spectrum,), mgf_output, key_order=key_order)
def writeMGF(spectra, outputFile):
    """Write ``spectra`` to ``outputFile`` through ``mgf.write``.

    Only the spectra and the output target are passed on; no ``header``
    argument is supplied to ``mgf.write``.

    Args:
        spectra: iterable of spectrum dictionaries accepted by ``mgf.write``.
        outputFile: output path or file object accepted by ``mgf.write``.
    """
    mgf.write(spectra=spectra, output=outputFile)
def convert_mq_mracluster_mgf(mq_msms, mrcluster_clusters, mgf_file, output,
                              px_accession, raw_name):
    """Retitle clustered spectra with a USI-style accession and append them to ``output``.

    For every scan returned by ``read_clusters``, each spectrum of
    ``mgf_file`` whose title ends with ``scan=<scan>`` gets a new title
    built by ``buid_usi_accession`` and is written to ``output``.

    Args:
        mq_msms: input handed to ``read_peptides`` (scan -> peptide sequence).
        mrcluster_clusters: input handed to ``read_clusters`` (scan -> cluster).
        mgf_file: path of the MGF file holding the input spectra.
        output: target handed to ``mgf.write``.
        px_accession: forwarded to ``buid_usi_accession``.
        raw_name: forwarded to ``buid_usi_accession``.
    """
    if mq_msms is None or mrcluster_clusters is None or mgf_file is None:
        # NOTE(review): execution continues afterwards unless print_help()
        # itself terminates the program.
        print_help()

    # Read the input spectra; the reader is closed once they are in memory.
    with mgf.read(mgf_file) as input_spectra:
        spectra_list = list(input_spectra)
    print('Number of Spectra: ' + str(len(spectra_list)))

    # Read the msms.txt file: a dictionary where the key is the scan number
    # and the value is the peptide sequence. One scan may be associated with
    # more than one peptide sequence.
    peptides = read_peptides(mq_msms)
    print('Number of Peptides: ' + str(len(peptides)))

    # Read clusters: a map where the key is the scan and the value is the
    # cluster the scan belongs to.
    clusters = read_clusters(mrcluster_clusters)
    print("Number of Clusters: " + str(len(clusters)))

    for scan in clusters:
        print('scan: ' + str(scan))
        title_suffix = 'scan=' + str(scan)
        for spectra in spectra_list:
            if not spectra['params']['title'].endswith(title_suffix):
                continue
            cluster_accession = clusters[scan]
            # Scans without an identified peptide are annotated with None.
            peptide_sequence = peptides[scan] if scan in peptides else None
            charge = int(spectra['params']['charge'][0])
            spectra['params']['title'] = buid_usi_accession(
                cluster_accession, peptide_sequence, scan, px_accession,
                raw_name, charge)
            mgf.write([spectra], output)
def patchMgf(self, input_path, output_path):
    """Copy an MGF file while dropping peaks outside an m/z window.

    A peak is removed when its m/z is below 175 or above
    ``pepmass * charge - (charge - 1) * 1.00782503 - maxWindowDiff``.
    The filtered spectra are written to ``output_path`` in one call.

    Args:
        input_path: MGF file to read.
        output_path: target handed to ``mgf.write``.
    """
    maxWindowDiff = 156.10112 + 2 * 1.00782503 + 15.9949146
    with mgf.read(input_path) as spectra:
        spectra_out = []
        for spectrum in spectra:
            int_dic = spectrum['intensity array']
            mz_dic = spectrum['m/z array']
            param_dic = spectrum['params']
            chrg_spec = param_dic['charge'][0]

            # The upper limit only depends on the precursor, so compute it
            # once per spectrum rather than once per peak.
            upper_limit = (param_dic['pepmass'][0] * chrg_spec
                           - (chrg_spec - 1) * 1.00782503
                           - maxWindowDiff)

            # 175: smallest y ion - arginin
            del_array = [
                pos for pos, m in enumerate(mz_dic)
                if m < 175 or m > upper_limit
            ]

            int_dic = np.delete(int_dic, del_array, 0)
            mz_dic = np.delete(mz_dic, del_array, 0)
            spectra_out.append({
                'm/z array': mz_dic,
                'intensity array': int_dic,
                'params': param_dic
            })
    mgf.write(spectra=spectra_out, output=output_path)
def annotate_mgf(mgf_input: str, mascot_input: str, mgf_output: str):
    """Annotate MGF file using Mascot XML results.

    Every spectrum of ``mgf_input`` is matched by title against the
    sequences extracted from the Mascot results; for each matching sequence
    the spectrum is appended to ``mgf_output`` with a ``seq`` parameter.
    Spectra without a matching sequence are not written.

    Args:
        mgf_input (str): path to the MGF input file.
        mascot_input (str): path to the Mascot XML results.
        mgf_output (str): path to the MGF output file.
    """
    # Retrieve mascot sequences.
    mascot_seq = extract_mascot_sequences(mascot_input)

    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            params = spectrum['params']
            same_title = mascot_seq.title == params['title']
            sequences = mascot_seq.loc[same_title, 'sequence'].values

            # A spectrum associated with several sequences is written once
            # per sequence.
            for seq in sequences:
                params['seq'] = seq
                mgf.write((spectrum,), mgf_output)
def export_annotated_spectra_to_mgf(self, mgf_path, report_just_heavy=False):
    """Write one spectrum per reference to ``mgf_path``.

    Args:
        mgf_path: target handed to ``mgf.write``.
        report_just_heavy: when true, the spectrum comes from
            ``ref.just_create_heavy_ms()`` and every peak is exported;
            otherwise it comes from ``ref.create_ms(...)`` and peaks whose
            ``meta['mass']`` equals -1 are left out.

    References that end up without any peak are skipped.
    """
    spectra_out = []
    for ref in self.references:
        if report_just_heavy:
            ms = ref.just_create_heavy_ms()
        else:
            ms = ref.create_ms(iMin_similarity=self.min_rel_similarity)

        buf_peaks = []
        buf_int = []
        for chrg in ms.spectrum:
            for master_peaks in ms.spectrum[chrg].values():
                for mp in master_peaks:
                    # meta is only consulted when not restricted to heavy peaks
                    if report_just_heavy or mp.meta['mass'] != -1:
                        buf_peaks.append(mp.mz)
                        buf_int.append(mp.intensity)

        if buf_peaks:
            spectra_out.append({
                'm/z array': buf_peaks,
                'intensity array': buf_int,
                'params': ref.params
            })
    mgf.write(spectra=spectra_out, output=mgf_path)
def MGF_generator(df, folder_mzxml, output_filename):
    """Build an MGF file from the mzXML scans referenced by the rows of ``df``.

    Args:
        df: table with the columns ``Original_Peaklist``, ``ScanmzXML``,
            ``ScanId`` and ``ExpMz``.
        folder_mzxml: prefix prepended to ``<Original_Peaklist>.mzXML``.
        output_filename: output path without the ``.mgf`` extension.
    """
    temp_spectrum_manager = []
    for _, row in df.iterrows():
        fn_mzxml_file = folder_mzxml + row['Original_Peaklist'] + '.mzXML'
        with mzxml.read(fn_mzxml_file) as output:
            # The scan is looked up by its identifier given as a string.
            data_spectrum1 = output[str(row['ScanmzXML'])]
            title = str('File:' + row['Original_Peaklist'] + '.' +
                        str(row['ScanmzXML']) + ' "scan=' +
                        str(row['ScanId']) + '"')
            temp_spectrum_manager.append({
                'params': {
                    'TITLE': title,
                    'CHARGE': '1+',
                    'PEPMASS': str(row['ExpMz']),
                    'SCANS': str(row['ScanId'])
                },
                'm/z array': data_spectrum1['m/z array'],
                'intensity array': data_spectrum1['intensity array']
            })
    output_filename = output_filename + '.mgf'
    mgf.write(temp_spectrum_manager, output_filename)
def load_recalibrate(self):
    """Re-write the spectra of ``self.path`` to ``self.file_out``, optionally shifted in m/z.

    For every spectrum a ``MasterSpectrum`` is built from its peaks. If a
    peak matching ``calculate_tag_tmt10()`` is found in it, all m/z values
    are shifted, either by a ppm-derived offset (``self.type == 'ppm'``) or
    by the absolute difference (``self.type == 'absolute'``); otherwise the
    m/z values are written unchanged. One ``mgf.write`` call is made per
    spectrum.

    Raises:
        ValueError: if recalibration applies and ``self.type`` is neither
            ``'ppm'`` nor ``'absolute'``.
    """
    fc = calculate_Delta_by_ppm(self.ppm)
    tmt_mass = calculate_tag_tmt10()
    with mgf.read(self.path) as spectra:
        for spectrum in spectra:
            ms = MasterSpectrum()
            params = spectrum['params']
            for mass, intensity in zip(spectrum['m/z array'],
                                       spectrum['intensity array']):
                ms.add(Peak(mass, intensity, fc))

            # Probe peak used to look up the reference mass in the spectrum.
            peak = Peak(tmt_mass, 0.5, fc)
            if peak.key() not in ms.spectrum[0]:
                recalibrate = False
            else:
                idx, bin_to_ack, a, b = ms.binary(
                    peak, 0, len(ms.spectrum[0][peak.key()]) - 1, 0)
                if idx == -1:
                    # idx of -1 is treated as "no matching peak".
                    recalibrate = False
                else:
                    recalibrate = True
                    recalibration_mass = ms.spectrum[0][peak.key()][idx].mz
                    diff = tmt_mass - recalibration_mass
                    print(params['title'])
                    print("original={0}\tdiff={1}".format(
                        recalibration_mass, diff))

            mass_list = []
            int_list = []
            if recalibrate:
                ppm_shift = calculate_ppm_shift(diff, tmt_mass)
            for key in ms.spectrum[0].keys():
                for mp in ms.spectrum[0][key]:
                    if recalibrate:
                        if self.type == 'ppm':
                            # diff is recomputed per peak from ppm_shift
                            diff = calculate_da_shift(mp.mz, ppm_shift)
                            mass_list.append(mp.mz + diff)
                        elif self.type == 'absolute':
                            # no-op assignment: the same diff is applied to
                            # every peak
                            diff = diff
                            mass_list.append(mp.mz + diff)
                        else:
                            print(self.type)
                            raise ValueError("what did you dooooo")
                    else:
                        mass_list.append(mp.mz)
                    int_list.append(mp.intensity)
            print("len is:\t{0}".format(len(mass_list)))
            mgf.write(spectra=[{
                'm/z array': mass_list,
                'intensity array': int_list,
                'params': params
            }], output=self.file_out)
def clean_mgf_file(file):
    """Add a ``protein`` parameter to every spectrum of an MGF file.

    The peptide sequence of each spectrum (``params['seq']`` with
    ``+<number>`` modifications removed) is searched in the concatenated
    sequences of a FASTA file; the ids of all records containing it are
    stored as ``<count>/<id1>/<id2>...``. The result is written next to the
    input as ``<name>_proteinsAdded.mgf``.

    Args:
        file: path of the MGF file to annotate.
    """
    fasta = 'C:/Users/ccranney/Desktop/Caleb_Files/data/2019-03-14-td-UP000005640.fasta'

    # fDict maps the start offset of each record inside longPep to its id.
    fDict = {}
    pieces = []
    offset = 0
    with open(fasta, 'r') as fasta_handle:
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            fDict[offset] = record.id
            piece = str(record.seq) + '.'
            pieces.append(piece)
            offset += len(piece)
    longPep = ''.join(pieces)
    # Sorted once: the offsets do not change while spectra are processed.
    sorted_keys = sorted(fDict.keys())

    cleaned = []
    count = 0
    pepCount = 0
    with mgf.read(file) as spectra:
        for spec in spectra:
            count += 1
            decoy = False
            if 'protein' in spec['params'] and 'DECOY' in spec['params']['protein']:
                # NOTE(review): `proteins` is not rebuilt on this path, so
                # the value left by the previous spectrum is reused (and it
                # is undefined if the first spectrum is a decoy) - confirm
                # the intended behaviour.
                decoy = True
            else:
                seq = re.sub(r'\+\d+\.\d+', '', spec['params']['seq'])
                proteins = set()
                for match in re.finditer(seq, longPep):
                    i = match.start()
                    insertion_point = bisect.bisect_left(sorted_keys, i)
                    # bisect_left points at the next record unless the match
                    # starts exactly on a record boundary; step back to the
                    # record that contains offset i.
                    if insertion_point == len(
                            sorted_keys) or sorted_keys[insertion_point] != i:
                        insertion_point -= 1
                    proteins.add(fDict[sorted_keys[insertion_point]])
            if len(proteins) == 0:
                proteins.add(spec['params']['seq'])

            if decoy:
                proteins = ['DECOY_0_' + x for x in proteins]

            protein = str(len(proteins)) + '/' + '/'.join(sorted(proteins))
            spec['params']['protein'] = protein
            if protein != '0/':
                pepCount += 1
            cleaned.append(spec)
            # Progress report every 1000 spectra.
            if count % 1000 == 0:
                print(count)
                print(pepCount)
                print(protein)

    cleanedFile = re.sub('(.*).mgf', r'\1_proteinsAdded.mgf', file)
    mgf.write(cleaned, cleanedFile)
def select_mgf(self):
    """Copy the spectra whose scan id is in ``self.list_chosen``.

    Spectra are read from ``self.path_mgf_in``; the scan id is parsed from
    the title with ``parse_scan_id`` and converted to ``int``. The selected
    spectra are written to ``self.path_mgf_out``.
    """
    with mgf.read(self.path_mgf_in) as spectra:
        spectra_out = [
            {
                'm/z array': spectrum['m/z array'],
                'intensity array': spectrum['intensity array'],
                'params': spectrum['params']
            }
            for spectrum in spectra
            if int(parse_scan_id(spectrum['params']['title'])) in self.list_chosen
        ]
    mgf.write(spectra=spectra_out, output=self.path_mgf_out)
def convert(args, out=sys.stdout): """Outputs spectral library sorted by mass.""" spectra = [] for oidx, sp in enumerate(mgf.read(args.lib_mgf)): print("Original spectra", sp) spectra.append((sp['params']['pepmass'][0],oidx,sp)) for _,_,s in sorted(spectra): print("Sorted order", s) if s['m/z array'].shape[0] < args.min_peak_count: continue mgf.write([s,], output=out) """
def setUp(self):
    """Read ``test.mgf``, write it to a temporary file and read it back.

    Stores the original header/spectra (``header``, ``spectra``), the
    round-tripped ones (``header2``, ``spectra2``) and the number of
    spectra (``ns``).
    """
    self.path = 'test.mgf'
    self.header = mgf.read_header(self.path)
    # Close the reader as soon as the spectra are loaded.
    with mgf.read(self.path) as reader:
        self.spectra = list(reader)
    self.tmpfile = tempfile.TemporaryFile(mode='r+')
    mgf.write(header=self.header, spectra=self.spectra,
              output=self.tmpfile)
    # Rewind before each read of the temporary file.
    self.tmpfile.seek(0)
    self.header2 = mgf.read_header(self.tmpfile)
    self.tmpfile.seek(0)
    tmpreader = mgf.read(self.tmpfile)
    self.spectra2 = list(tmpreader)
    self.ns = len(self.spectra)
    self.tmpfile.close()
def save_mgf_output(merge_result, ms2_file, output_dir, timestamp):
    """Write ``merge_result["merged_mgf"]`` to the merged MGF file.

    The output directory and file name are derived from ``ms2_file``,
    ``output_dir`` and ``timestamp`` by the ``generate_output_*`` helpers;
    the directory is created when it does not exist yet.
    """
    # create merged directory and save renamed file out to it
    target_dir = generate_output_directory_name(ms2_file, output_dir,
                                                timestamp)
    directory_missing = not os.path.isdir(target_dir)
    if directory_missing:
        os.makedirs(target_dir)

    merged_mgf_filename = generate_output_merged_mgf_name(
        ms2_file, output_dir, timestamp)
    print("\nWriting merged MGF: " + merged_mgf_filename)

    write_options = {
        'use_numpy': True,
        'write_charges': False,
        'fragment_format': '%.4f %.4f',
    }
    mgf.write(merge_result["merged_mgf"],
              output=merged_mgf_filename,
              **write_options)
def clean_mgf_file(mgfFile, fasta, ions=False):
    """Add a ``protein`` parameter to the spectra of an MGF file.

    The peptide sequence of each spectrum (``params['seq']`` with
    ``+<number>`` modifications removed) is searched in the concatenated
    sequences of ``fasta``; the ids of all records containing it are stored
    as ``<count>/<id1>/<id2>...``. The result is written next to the input
    as ``<name>_proteinsAdded.mgf`` (``<name>_proteinsAdded_YBionsOnly.mgf``
    when ``ions`` is true).

    Args:
        mgfFile: path of the MGF file to annotate.
        fasta: path of the FASTA file used for the protein lookup.
        ions: when true, only peaks for which ``smf.approx_list`` finds a
            match among ``return_frag_mzs(seq, 1)`` are kept, and spectra
            left without peaks are dropped.
    """
    # fDict maps the start offset of each record inside longPep to its id.
    fDict = {}
    pieces = []
    offset = 0
    with open(fasta, 'r') as fasta_handle:
        for record in SeqIO.parse(fasta_handle, 'fasta'):
            fDict[offset] = record.id
            piece = str(record.seq) + '.'
            pieces.append(piece)
            offset += len(piece)
    longPep = ''.join(pieces)
    # Sorted once: the offsets do not change while spectra are processed.
    sorted_keys = sorted(fDict.keys())

    cleaned = []
    count = 0
    pepCount = 0
    with mgf.read(mgfFile) as spectra:
        for spec in spectra:
            count += 1
            if ions:
                mzValues = return_frag_mzs(spec['params']['seq'], 1)
                # Single pass instead of popping from the list in reverse.
                peaks = [
                    peak for peak in zip(spec['m/z array'],
                                         spec['intensity array'])
                    if smf.approx_list(peak[0], mzValues) != -1
                ]
                if not peaks:
                    continue
                peaks.sort(key=lambda x: x[0])
                spec['m/z array'], spec['intensity array'] = map(
                    list, zip(*peaks))

            decoy = False
            if 'protein' in spec['params'] and 'DECOY' in spec['params']['protein']:
                # NOTE(review): `proteins` is not rebuilt on this path, so
                # the value left by the previous spectrum is reused (and it
                # is undefined if the first spectrum is a decoy) - confirm
                # the intended behaviour.
                decoy = True
            else:
                seq = re.sub(r'\+\d+\.\d+', '', spec['params']['seq'])
                proteins = set()
                for match in re.finditer(seq, longPep):
                    i = match.start()
                    insertion_point = bisect_left(sorted_keys, i)
                    # Step back to the record that contains offset i unless
                    # the match starts exactly on a record boundary.
                    if insertion_point == len(sorted_keys) or sorted_keys[insertion_point] != i:
                        insertion_point -= 1
                    proteins.add(fDict[sorted_keys[insertion_point]])
            if len(proteins) == 0:
                proteins.add('protein_not_in_fasta_' + spec['params']['seq'])

            if decoy:
                proteins = ['DECOY_0_' + x for x in proteins]

            protein = str(len(proteins)) + '/' + '/'.join(sorted(proteins))
            if protein != '0/':
                spec['params']['protein'] = protein
                pepCount += 1
            cleaned.append(spec)
            # Progress report every 1000 spectra.
            if count % 1000 == 0:
                print(count)
                print(pepCount)
                print(protein)

    cleanedFile = re.sub('(.*).mgf', r'\1_proteinsAdded.mgf', mgfFile)
    if ions:
        cleanedFile = re.sub('(.*).mgf', r'\1_YBionsOnly.mgf', cleanedFile)
    mgf.write(cleaned, cleanedFile)
def old_mgf_writer(spectrum_data_dict, output_dir, datatype):
    """Write one spectrum to ``<output_dir><datatype>_data.mgf`` (deprecated).

    ``spectrum_data_dict`` holds the ``'m/z array'`` and
    ``'intensity array'`` entries; every other entry becomes a parameter of
    the written spectrum.
    """
    # deprecated function
    # write spectrum data to .mgf file
    array_keys = ('m/z array', 'intensity array')
    processed_sample_params = {}
    for key, value in spectrum_data_dict.items():
        if key not in array_keys:
            processed_sample_params[key] = value

    spectrum_entry = {
        'm/z array': spectrum_data_dict['m/z array'],
        'intensity array': spectrum_data_dict['intensity array'],
        'params': processed_sample_params
    }
    pytmgf.write(spectra=[spectrum_entry],
                 output=output_dir + datatype + "_data.mgf")
def patchMgf(self, input_path, output_path):
    '''
    Copy the spectra of ``input_path`` to ``output_path`` after deleting
    peaks that are found in ``self.exclusionSpectrum`` or that fall on
    ``precursor - delta`` for a delta in ``self.precursorDeltas``.
    '''
    # NOTE(review): the source this was recovered from had lost its
    # indentation; the `else` below is attached to the nearest `if`
    # (`if idx != -1`). Confirm against the original file.
    with mgf.read(input_path) as spectra:
        spectra_out = []
        for spectrum in spectra:
            int_dic = spectrum['intensity array']
            mz_dic = spectrum['m/z array']
            param_dic = spectrum['params']
            chrg_spec = spectrum['params']['charge'][0]
            precursor = calculatePrecursor(
                mz=spectrum['params']['pepmass'][0], charge=chrg_spec)
            # pos tracks the index of the current peak in mz_dic
            pos = 0
            del_array = []
            for m in mz_dic:
                peak = Peak(m, 0, self.delta_function)
                if peak.key() in self.exclusionSpectrum.spectrum[0]:
                    idx, bin_to_ack, should_merge_left_peak, should_merge_right_peak = self.exclusionSpectrum.binary(
                        peak, 0,
                        len(self.exclusionSpectrum.spectrum[0][peak.key()]) -
                        1, 0)
                    if idx != -1:  # found
                        del_array.append(pos)
                    else:
                        mp = MasterPeak(peak)
                        # pos may be appended more than once if several
                        # deltas match
                        for precursorDelta in self.precursorDeltas:
                            if mp.isInsideMz(precursor - precursorDelta):
                                del_array.append(pos)
                            else:
                                pass
                pos += 1
            int_dic = np.delete(int_dic, del_array, 0)
            mz_dic = np.delete(mz_dic, del_array, 0)
            spectra_out.append({
                'm/z array': mz_dic,
                'intensity array': int_dic,
                'params': param_dic
            })
    mgf.write(spectra=spectra_out, output=output_path)
def main():
    """Command-line entry point: average clustered MS/MS spectra into an MGF file.

    In ``single`` mode the whole input MGF is averaged into one spectrum; in
    ``encoded_clusters`` mode the input is handed to
    ``process_maracluster_mgf``. Choosing ``--pepmass lower_median`` forces
    ``--rt mass_lower_median``.
    """
    pars = argparse.ArgumentParser()
    pars.add_argument('input', help='MGF file with clustered spectra.')
    pars.add_argument('output', nargs='?', help='Output file (default is stdout).')
    pars.add_argument('--mode', choices=['single', 'encoded_clusters'], default='encoded_clusters',
                      help='Operation mode. Single: input MGF is interpreted as a single cluster. '
                           'encoded_clusters: cluster IDs are parsed out of spectrum titles.')
    pars.add_argument('--dyn-range', type=float, default=DYN_RANGE,
                      help='Dynamic range to apply to output spectra')
    pars.add_argument('--min-fraction', type=float, default=MIN_FRACTION,
                      help='Minimum fraction of cluster spectra where MS/MS peak is present.')
    pars.add_argument('--mz-accuracy', type=float, default=DIFF_THRESH,
                      help='Minimum distance between MS/MS peak clusters.')
    pars.add_argument('--append', action='store_true',
                      help='Append to output file instead of replacing it.')
    pars.add_argument('--rt', choices=['median', 'mass_lower_median'], default='median')
    pars.add_argument('--pepmass', choices=['naive_average', 'neutral_average', 'lower_median'],
                      default='lower_median')
    pars.add_argument('--msms_avg', choices=['naive', 'weighted'], default='weighted')

    args = pars.parse_args()
    # The lower-median precursor mass is paired with the matching RT choice.
    if args.pepmass == 'lower_median':
        args.rt = 'mass_lower_median'

    get_rt = {'median': median_rt,
              'mass_lower_median': lower_median_mass_rt}[args.rt]
    get_pepmass = {'naive_average': naive_average_mass_and_charge,
                   'neutral_average': neutral_average_mass_and_charge,
                   'lower_median': lower_median_mass}[args.pepmass]

    kwargs = {'mz_accuracy': args.mz_accuracy, 'dyn_range': args.dyn_range,
              'min_fraction': args.min_fraction, 'msms_avg': args.msms_avg}
    mode = 'a' if args.append else 'w'

    if args.mode == 'single':
        # Close the reader once the spectra are in memory.
        with mgf.read(args.input) as reader:
            spectra = list(reader)
        mz, c = get_pepmass(spectra)
        rt = get_rt(spectra)
        mgf.write([average_spectrum(spectra, title=args.output, pepmass=mz, charge=c,
                                    rtinseconds=rt, **kwargs)],
                  args.output, file_mode=mode)
    elif args.mode == 'encoded_clusters':
        mgf.write(process_maracluster_mgf(args.input, get_pepmass=get_pepmass,
                                          get_rt=get_rt, **kwargs),
                  args.output, file_mode=mode)
    else:
        raise NotImplementedError('This mode is not implemented yet.')
def seperateMgf(self, folder_out):
    '''
    Write one MGF file per key of ``self.goodBadUgly``.

    For each key, the spectra of ``self.pathMgf`` whose title is listed
    under that key are written to
    ``<folder_out><self.file_name>_<key>.mgf``.
    '''
    for idea in self.goodBadUgly.keys():
        wanted_titles = self.goodBadUgly[idea]
        # The input file is re-read for every key.
        with mgf.read(self.pathMgf) as spectra:
            spectra_out = [
                {
                    'm/z array': spectrum['m/z array'],
                    'intensity array': spectrum['intensity array'],
                    'params': spectrum['params']
                }
                for spectrum in spectra
                if spectrum['params']['title'] in wanted_titles
            ]
        output_path = folder_out + self.file_name + "_" + str(idea) + ".mgf"
        mgf.write(spectra=spectra_out, output=output_path)
def save_mgf_output(merge_result, ms2_file, output_dir, timestamp):
    """Write ``merge_result["merged_mgf"]`` to the merged MGF file.

    The output directory and file name are derived from ``ms2_file``,
    ``output_dir`` and ``timestamp`` by the ``generate_output_*`` helpers;
    the directory is created when it does not exist yet. A timestamped
    message is printed before and after the write.
    """
    # create merged directory and save renamed file out to it
    target_dir = generate_output_directory_name(ms2_file, output_dir,
                                                timestamp)
    directory_missing = not os.path.isdir(target_dir)
    if directory_missing:
        os.makedirs(target_dir)

    merged_mgf_filename = generate_output_merged_mgf_name(
        ms2_file, output_dir, timestamp)

    utility.print_timestamp("Merge MS2/MS3 - Writing merged MGF - Start - " +
                            basename(merged_mgf_filename))
    write_options = {
        'use_numpy': True,
        'write_charges': False,
        'fragment_format': '%.4f %.4f',
    }
    mgf.write(merge_result["merged_mgf"],
              output=merged_mgf_filename,
              **write_options)
    utility.print_timestamp(
        "Merge MS2/MS3 - Writing merged MGF - Complete - " +
        basename(merged_mgf_filename))
def save_as_mgf(spectrums: List[Spectrum], filename: str):
    """Save spectrum(s) as mgf file.

    :py:attr:`~matchms.Spectrum.losses` of spectrum will not be saved.

    Example:

    .. code-block:: python

        import numpy
        from matchms import Spectrum
        from matchms.exporting import save_as_mgf

        # Create dummy spectrum
        spectrum = Spectrum(mz=numpy.array([100, 200, 300], dtype="float"),
                            intensities=numpy.array([10, 10, 500], dtype="float"),
                            metadata={"charge": -1,
                                      "inchi": '"InChI=1S/C6H12"',
                                      "precursor_mz": 222.2})

        # Write spectrum to test file
        save_as_mgf(spectrum, "test.mgf")

    Parameters
    ----------
    spectrums:
        Expected input are match.Spectrum.Spectrum() objects. Anything that
        is not a ``list`` is treated as a single spectrum.
    filename:
        Provide filename to save spectrum(s). Spectra are appended to it.
    """
    if not isinstance(spectrums, list):
        # Assume that input was single Spectrum
        spectrums = [spectrums]

    if not spectrums:
        # Nothing to write: leave the file system untouched.
        return

    # Convert matchms.Spectrum() into dictionaries for pyteomics. The
    # generator is consumed by a single write call, so the output file is
    # opened once instead of once per spectrum.
    spectrum_dicts = (
        {
            "m/z array": spectrum.peaks.mz,
            "intensity array": spectrum.peaks.intensities,
            "params": spectrum.metadata,
        }
        for spectrum in spectrums
    )

    # Append spectra to file
    py_mgf.write(spectrum_dicts, filename)
def setUp(self):
    """Load the test fixtures.

    ``test.mgf`` is read, written to a temporary file and read back
    (``header``/``spectra`` versus ``header2``/``spectra2``);
    ``test_annotated.mgf`` is read with ``read_ions=True``.
    """
    # Plain fixture.
    self.path = 'test.mgf'
    self.header = mgf.read_header(self.path)
    with mgf.read(self.path) as reader:
        self.spectra = list(reader)
    self.ns = len(self.spectra)

    # Round trip through a temporary file; rewind before each read.
    self.tmpfile = tempfile.TemporaryFile(mode='r+')
    mgf.write(header=self.header, spectra=self.spectra,
              output=self.tmpfile)
    self.tmpfile.seek(0)
    self.header2 = mgf.read_header(self.tmpfile)
    self.tmpfile.seek(0)
    self.spectra2 = list(mgf.read(self.tmpfile))
    self.tmpfile.close()

    # Annotated fixture.
    self.path_annotated = 'test_annotated.mgf'
    self.header_annotated = mgf.read_header(self.path_annotated)
    with mgf.read(self.path_annotated, read_ions=True) as reader:
        self.spectra_annotated = list(reader)
def test_read_write_with_ions(self):
    """Round-trip the annotated spectra with ``write_ions``/``read_ions``.

    The spectra are written with and without numpy, each with its own
    fragment format, read back and compared entry by entry with
    ``data.mgf_spectra_annotated_long``.
    """
    formats = ['{:.6f} {:.6f} {}', '%.6f %.6f %s']
    for use_numpy in range(2):
        with tempfile.TemporaryFile(mode='r+') as f:
            mgf.write(self.spectra_annotated, f, write_ions=True,
                      use_numpy=use_numpy,
                      fragment_format=formats[use_numpy])
            f.seek(0)
            spectra = list(mgf.read(f, read_ions=True))
        for spec_data, spec_read in zip(data.mgf_spectra_annotated_long,
                                        spectra):
            # Check that the spectra have the same dict keys
            self.assertEqual(spec_data.keys(), spec_read.keys())
            for key in spec_data.keys():
                # Dictionaries (including subclasses) are compared as
                # dicts, everything else as arrays.
                if isinstance(spec_data[key], dict):
                    self.assertDictEqual(spec_data[key], spec_read[key])
                else:
                    np.testing.assert_array_equal(spec_data[key],
                                                  spec_read[key])
def save_as_mgf(spectrums: Union[Spectrum, List[Spectrum]], filename: str):
    """Save spectrum(s) as mgf file.

    Args:
    ----
    spectrums: list of Spectrum() objects, Spectrum() object
        Expected input are match.Spectrum.Spectrum() objects. Anything that
        is not a ``list`` is treated as a single spectrum.
    filename: str
        Provide filename to save spectrum(s). Spectra are appended to it.
    """
    if not isinstance(spectrums, list):
        # Assume that input was single Spectrum
        spectrums = [spectrums]

    if not spectrums:
        # Nothing to write: leave the file system untouched.
        return

    # Convert matchms.Spectrum() into dictionaries for pyteomics. The
    # generator is consumed by a single write call, so the output file is
    # opened once instead of once per spectrum.
    spectrum_dicts = (
        {
            "m/z array": spectrum.peaks.mz,
            "intensity array": spectrum.peaks.intensities,
            "params": spectrum.metadata,
        }
        for spectrum in spectrums
    )

    # Append spectra to file
    py_mgf.write(spectrum_dicts, filename)
def main():
    """Merge predicted retention times into predicted MS/MS spectra.

    The RT predictions (tab-separated) and the MS/MS predictions (MGF) come
    from ``snakemake.input``. The spectrum index is split into one chunk per
    CPU core, ``utils.add_rt_to_spectra`` is run on the chunks in a process
    pool, and the combined result is written to ``snakemake.output[0]``.
    """
    rt_prediction_df = pd.read_csv(snakemake.input['rt_prediction_output'],
                                   sep="\t",
                                   index_col=False)

    with mgf.read(snakemake.input['msms_prediction_output'],
                  read_ions=True,
                  convert_arrays=1,
                  index_by_scans=False) as msms_prediction_mgf:
        n_workers = mp.cpu_count()

        # Split indices into as many chunks as we have cores
        spectrum_keys = np.asarray(list(msms_prediction_mgf.index.keys()))
        index_splits = np.array_split(spectrum_keys, n_workers)

        # Merge RT with MSMS chunk-wise in parallel, then concatenate the
        # results
        merge_chunk = partial(utils.add_rt_to_spectra, msms_prediction_mgf,
                              rt_prediction_df)
        with mp.Pool(n_workers) as pool:
            prediction_mgf_aslist = np.hstack(
                pool.map(merge_chunk, index_splits))

        mgf.write(spectra=prediction_mgf_aslist,
                  output=snakemake.output[0],
                  fragment_format="%.6f %.7f %s",
                  header=msms_prediction_mgf.header,
                  write_charges=False,
                  write_ions=True)
def write_mgf(filename: str, spectra: List[sus.MsmsSpectrum]) -> None:
    """
    Write the given spectra to an MGF file.

    The title of each entry is ``<cluster>;<identifier>``; precursor m/z,
    retention time and precursor charge are stored as ``pepmass``,
    ``rtinseconds`` and ``charge``.

    Parameters
    ----------
    filename : str
        The file name of the MGF output file.
    spectra : List[sus.MsmsSpectrum]
        The spectra to be written to the MGF file.
    """
    spectra_dict = []
    for spectrum in spectra:
        params = {
            'title': f'{spectrum.cluster};{spectrum.identifier}',
            'pepmass': spectrum.precursor_mz,
            'rtinseconds': spectrum.retention_time,
            'charge': spectrum.precursor_charge,
        }
        spectra_dict.append({
            'm/z array': spectrum.mz,
            'intensity array': spectrum.intensity,
            'params': params,
        })

    with open(filename, 'w') as f_out:
        mgf.write(spectra_dict, f_out)
import os
import re
import sys

from pyteomics import mgf


def natural_sort(s, _nsre=re.compile('([0-9]+)')):
    """Return a sort key that orders embedded digit runs numerically.

    The string is split on runs of digits; digit runs become ``int`` and
    the other pieces are lower-cased.
    """
    key = []
    for text in _nsre.split(s):
        key.append(int(text) if text.isdigit() else text.lower())
    return key


# Directory holding the MGF files: first command-line argument, else the
# current directory.
mgf_dir = sys.argv[1] if len(sys.argv) > 1 else '.'

# Collect the spectra of all MGF files in natural file-name order and
# renumber their scans consecutively, starting at 1.
scan_nr = 1
spectra = []
for filename in sorted(os.listdir(mgf_dir), key=natural_sort):
    if not filename.endswith('.mgf'):
        continue
    # Report the first scan number assigned to this file.
    print(f'{filename}\t{scan_nr}')
    mgf_path = os.path.join(mgf_dir, filename)
    for spectrum_dict in mgf.read(mgf_path, use_index=False):
        spectrum_dict['params']['scans'] = str(scan_nr)
        spectra.append(spectrum_dict)
        scan_nr += 1

f_out = mgf.write(spectra, os.path.join(mgf_dir, 'merged.mgf'),
                  file_mode='w')
f_out.close()
# keep library rows that have a non-blank SMILES
lib = lib.dropna(subset=['Smiles'])
lib = lib[lib.Smiles != ' ']
lib['Smiles'] = lib['Smiles'].str.strip()  # remove white spaces

# flatten the InChIKey rows and drop the 'InChIKey=' prefix
ikeys = [
    key.replace('InChIKey=', '')
    for row_values in ikeys.values.tolist()
    for key in row_values
]
smiles["inchikey"] = ikeys
smiles = smiles.rename(columns={'SMILES': 'Smiles'}).drop_duplicates(subset=['Smiles'])

# attach the InChIKeys to the library and keep one row per InChIKey
libcomb = lib.merge(smiles, how="left", on="Smiles")
libcomb = libcomb.dropna(subset=['inchikey']).drop_duplicates(
    subset='inchikey', keep='first', inplace=False)  # remove duplicate InChIKeys

# load GNPS .mgf file downloaded from https://gnps.ucsd.edu/ProteoSAFe/status.jsp?task=6e22f85aeb0744208e872d1640f508d9
mgf_file = 'ProteoSAFe-METABOLOMICS-SNETS-6e22f85a-download_cluster_buckettable/METABOLOMICS-SNETS-6e22f85a-download_clustered_spectra-main_ChargeReplaced.mgf'

scans = libcomb.Scan.tolist()
smiles_by_row = libcomb.Smiles.tolist()
inchikey_by_row = libcomb.inchikey.tolist()
counter = 0
with mgf.read(mgf_file) as reader:
    for spectrum in reader:
        spectrum_params = spectrum['params']
        for idx, scan in enumerate(scans):
            if spectrum_params['scans'] != str(scan):
                continue
            # annotate the matching spectrum and append it to the output
            file_name = '{}.mgf'.format("GNPSLibraries_uniqueSMILES_withFeatureIDs")
            spectrum_params['SMILES'] = smiles_by_row[idx]
            spectrum_params['InchiKey'] = inchikey_by_row[idx]
            counter += 1
            spectrum_params['FEATURE_ID'] = counter
            mgf.write((spectrum,), file_name)
def to_mgf(sample_run, output_path=None, header=None, **kwargs):
    """Write ``process_database(sample_run, **kwargs)`` as MGF.

    Args:
        sample_run: input handed to ``process_database``.
        output_path: target handed to ``mgf.write`` (``None`` by default).
        header: header handed to ``mgf.write``; an empty dict when omitted.
        **kwargs: forwarded to ``process_database``.

    Returns:
        Whatever ``mgf.write`` returns.
    """
    effective_header = {} if header is None else header
    spectra = process_database(sample_run, **kwargs)
    return mgf.write(spectra, output_path, effective_header)
# output MGF file
# NOTE(review): recovered from source without indentation; the final print
# is assumed to sit outside the `if` - confirm against the original file.
output_df = df[df.prediction > 0.5]
index_list = output_df['mzxml_index'].astype(int).tolist()
# mzxml_index is shifted down by one to index into `file`
indices = [x - 1 for x in index_list]
if indices:
    steroid_file = file[indices]
    output_mgf = steroid_file.copy()
    # keys stripped from each scan before the MGF params are attached
    dropped_keys = [
        'num', 'centroided', 'retentionTime', 'polarity', 'msLevel',
        'collisionEnergy', 'peaksCount', 'lowMz', 'highMz', 'basePeakMz',
        'basePeakIntensity', 'totIonCurrent', 'precursorMz', 'id'
    ]
    for i, scan in enumerate(steroid_file):
        output_mgf[i] = removekey(scan, dropped_keys)
        output_mgf[i]['params'] = {
            'title': scan['num'],
            'rtinseconds': 60 * scan['retentionTime'],
            'pepmass': scan['precursorMz'][0]['precursorMz'],
            'charge': '0+',
        }
    os.chdir(output_dir)
    mgf_name = 'steroid_' + mzxml_file[0:-6] + '.mgf'
    mgf.write(output_mgf,
              output=mgf_name,
              key_order=['title', 'rtinseconds', 'pepmass', 'charge'],
              write_charges=True)
print('File completed')
# NOTE(review): this fragment starts inside a construct whose header is not
# visible here (`spectrum`, `mibig_entries` and `fwriter` are defined
# earlier); the indentation of this first part is a reconstruction.
spec_id = spectrum['params']['spectrumid']
spec_inchi = spectrum['params']['inchikey']
spec_smiles = spectrum['params']['smiles']
# one output row per MIBiG entry paired with the current spectrum
for mibig_ids, mibig_inchi, mibig_smiles in mibig_entries:
    mibig_id, mibig_name, ext_id = mibig_ids
    output_list = [
        spec_id, spec_inchi, spec_smiles, mibig_id, mibig_name, mibig_inchi,
        mibig_smiles
    ]
    fwriter.writerow(output_list)
    #print(output_string)

# In[8]:

# Write a MGF file with the matched spectra
# (the first element of each entry in `matches` is the spectrum)
mgf.write([x[0] for x in matches], 'matched_mibig_gnps_update.mgf')

# In[14]:

# bare expression: only has an effect as notebook cell output
matches[0][0]['params']

# In[24]:

# Tag every matched spectrum with '<bgc id>.<spectrum id>' as BGCID.
matches_with_bgc_id = []
for ms, bgc in matches:
    bgc_id = bgc[0][0][0]
    spectrum_id = ms['params']['spectrumid']
    compound_id = '.'.join((bgc_id, spectrum_id))
    ms['params']['BGCID'] = compound_id
    matches_with_bgc_id.append(ms)