def calculate_ndp_time(spectra_mgf_file1, spectra_mgf_file2):
    score_list = []
    bins_spectrum_01, bins_spectrum_02 = [], []
    tmp_time_01 = time.perf_counter()
    spectra01 = read(spectra_mgf_file1, convert_arrays=1)
    spectra02 = read(spectra_mgf_file2, convert_arrays=1)
    for data01 in spectra01:
        spectrum01_mz_array = data01.get("m/z array")
        spectrum01_intens_array = data01.get("intensity array")
        bin_spectrum01 = ndp_bin_spectrum(spectrum01_mz_array, spectrum01_intens_array)
        bins_spectrum_01.append(bin_spectrum01)
    for data02 in spectra02:
        spectrum02_mz_array = data02.get("m/z array")
        spectrum02_intens_array = data02.get("intensity array")
        bin_spectrum02 = ndp_bin_spectrum(spectrum02_mz_array, spectrum02_intens_array)
        bins_spectrum_02.append(bin_spectrum02)
    time01 = time.perf_counter()
    print("Encoding time for the two files: {}".format(time01 - tmp_time_01))
    for j in range(len(bins_spectrum_01)):
        score = caculate_nornalization_dp(bins_spectrum_01[j], bins_spectrum_02[j])
        score_list.append(score)
    # np.savetxt("./data/1130_test_use_time_ndp.txt", score_list)
    time02 = time.perf_counter()
    print("Similarity use time: {}".format(time02 - time01))
def test_read_array_conversion(self):
    with mgf.read(self.path, convert_arrays=0) as reader:
        self.assertEqual(data.mgf_spectra_lists, list(reader))
    with mgf.read(self.path, convert_arrays=2) as reader:
        s = next(reader)
        self.assertTrue(isinstance(s['charge array'], np.ma.core.MaskedArray))
        self.assertTrue(isinstance(s['m/z array'], np.ndarray))
    with mgf.read(self.path, convert_arrays=1) as reader:
        s = next(reader)
        self.assertTrue(isinstance(s['charge array'], np.ndarray))
        self.assertTrue(isinstance(s['m/z array'], np.ndarray))
def main(cluster_file, consensus_file):
    with mgf.read(consensus_file) as reader:
        for spectrum_dict in reader:
            peptide_seq = spectrum_dict['params']['title']
            precursor_mz = spectrum_dict['params']['pepmass'][0]
            precursor_charge = spectrum_dict['params']['charge'][0]
            cons_mz = spectrum_dict['m/z array']
            cons_intensity = spectrum_dict['intensity array']
            retention_time = float(spectrum_dict['params']['rtinseconds'])
            break
    cons_spec = sus.MsmsSpectrum(peptide_seq, precursor_mz=precursor_mz,
                                 precursor_charge=precursor_charge,
                                 mz=cons_mz, intensity=cons_intensity,
                                 retention_time=retention_time,
                                 peptide=peptide_seq)
    with mgf.read(cluster_file) as reader:
        for spectrum_dict in reader:
            precursor_mz = spectrum_dict['params']['pepmass'][0]
            precursor_charge = spectrum_dict['params']['charge'][0]
            mz = spectrum_dict['m/z array']
            intensity = spectrum_dict['intensity array']
            retention_time = float(spectrum_dict['params']['rtinseconds'])
            spectrum = sus.MsmsSpectrum(peptide_seq, precursor_mz=precursor_mz,
                                        precursor_charge=precursor_charge,
                                        mz=mz, intensity=intensity,
                                        retention_time=retention_time,
                                        peptide=peptide_seq)
            # Process the MS/MS spectrum.
            fragment_tol_mass = 10
            fragment_tol_mode = 'ppm'
            # fragment_tol_mass = .5
            # fragment_tol_mode = 'Da'
            spectrum = (spectrum.set_mz_range(min_mz=100, max_mz=1400)
                        .remove_precursor_peak(fragment_tol_mass, fragment_tol_mode)
                        .filter_intensity(min_intensity=0.05, max_num_peaks=50)
                        .scale_intensity('root')
                        .annotate_peptide_fragments(fragment_tol_mass, fragment_tol_mode,
                                                    ion_types='aby'))
            # Generate theoretical spec
            # Plot the MS/MS spectrum.
            fig, ax = plt.subplots(figsize=(12, 6))
            # sup.spectrum(spectrum, ax=ax)
            sup.mirror(spectrum, tspec, ax=ax)
            plt.show()
            plt.close()
def setUp(self):
    self.path = 'test.mgf'
    self.header = mgf.read_header(self.path)
    self.spectra = list(mgf.read(self.path))
    self.tmpfile = tempfile.TemporaryFile(mode='r+')
    mgf.write(header=self.header, spectra=self.spectra, output=self.tmpfile)
    self.tmpfile.seek(0)
    self.header2 = mgf.read_header(self.tmpfile)
    self.tmpfile.seek(0)
    tmpreader = mgf.read(self.tmpfile)
    self.spectra2 = list(tmpreader)
    self.ns = len(self.spectra)
    self.tmpfile.close()
def get_cluster_spectra(mgf_filename: str) -> Dict[str, sus.MsmsSpectrum]:
    """
    Read all spectra from the given MGF file corresponding to a single cluster.

    Parameters
    ----------
    mgf_filename : str
        The file name of the MGF file to be read.

    Returns
    -------
    Dict[str, sus.MsmsSpectrum]
        A dictionary with the USIs as keys and the corresponding spectra as
        values.
    """
    spectra = {}
    for spectrum_dict in mgf.read(mgf_filename):
        # TODO: Make sure the USI doesn't contain a peptide identification.
        cluster, usi = spectrum_dict['params']['title'].split(';')
        spectrum = sus.MsmsSpectrum(
            usi,
            spectrum_dict['params']['pepmass'][0],
            spectrum_dict['params']['charge'][0],
            spectrum_dict['m/z array'],
            spectrum_dict['intensity array'],
            retention_time=spectrum_dict['params']['rtinseconds'])
        spectrum.cluster = cluster
        if usi in spectra:
            raise ValueError(f'Non-unique USI: {usi}')
        spectra[usi] = spectrum
    return spectra
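# Hedged usage sketch for get_cluster_spectra: 'cluster.mgf' is a hypothetical path to an MGF
# file whose TITLE fields follow the 'cluster;USI' convention assumed by the function above.
spectra_by_usi = get_cluster_spectra('cluster.mgf')
for usi, spec in spectra_by_usi.items():
    print(usi, spec.cluster, spec.precursor_mz)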
def patchMgf(self, input_path, output_path):
    maxWindowDiff = 156.10112 + 2 * 1.00782503 + 15.9949146
    with mgf.read(input_path) as spectra:
        spectra_out = []
        for spectrum in spectra:
            int_dic = spectrum['intensity array']
            mz_dic = spectrum['m/z array']
            param_dic = spectrum['params']
            chrg_spec = spectrum['params']['charge'][0]
            pos = 0
            del_array = []
            for m in mz_dic:
                if m < 175:  # smallest y ion - arginine
                    del_array.append(pos)
                elif m > spectrum['params']['pepmass'][0] * chrg_spec - (chrg_spec - 1) * 1.00782503 - maxWindowDiff:
                    del_array.append(pos)
                pos += 1
            int_dic = np.delete(int_dic, del_array, 0)
            mz_dic = np.delete(mz_dic, del_array, 0)
            spectra_out.append({'m/z array': mz_dic,
                                'intensity array': int_dic,
                                'params': param_dic})
    mgf.write(spectra=spectra_out, output=output_path)
def process(mgf_path, output_file):
    identifications = dict()
    imported_n = 0
    for file in os.listdir(mgf_path):
        if not file.lower().endswith('.mgf'):
            continue
        imported_n += 1
        for spectrum in mgf.read(mgf_path + "/" + file):
            params = spectrum.get('params')
            title = params.get('title')
            seq = params.get('seq')
            if seq == "" or seq is None:
                # seq = "_UNID_"
                continue
            identifications[title] = seq
    if imported_n < 1:
        # read from old identification files
        if os.path.isfile(output_file) and os.path.getsize(output_file) > 1000:
            with open(output_file, 'r') as o:
                lines = o.readlines()[1:]  # remove the table head line
            for line in lines:
                line = line.rstrip()
                sections = line.split("\t")
                title = sections[0]
                seq = sections[1]
                identifications[title] = seq
        else:
            raise Exception("No mgf file found here! %s" % (mgf_path))
    write_to_file(identifications, output_file)
    # phoenix_writer.export_ident_to_phoenix("pxd000021_test", "localhost", identifications)
def plain_parse(mgf_read_path, mgf_txt_write_path):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if os.path.isfile(mgf_txt_write_path):
        return "mgf_txt_write_path is already a file"
    utility.print_timestamp("Plain Parse MGF - Start - " + basename(mgf_read_path))
    with open(mgf_txt_write_path, 'w') as mgf_csv:
        with mgf.read(mgf_read_path) as mgf_reader:
            csv_writer = csv.writer(mgf_csv, delimiter='\t')
            csv_writer.writerow(['filename', 'scan', 'charge', 'rt', 'ms1 intensity'])
            for spectrum in mgf_reader:
                scans = spectrum['params']['scans']
                charge = re.sub(r'[^\d.]+', '', str(spectrum['params']['charge']))
                rt = spectrum['params']['rtinseconds']
                ms1_intensity = spectrum['params']['pepmass'][1]
                csv_writer.writerow([os.path.basename(mgf_read_path), scans, charge, rt, ms1_intensity])
    utility.print_timestamp("Plain Parse MGF - Complete - " + basename(mgf_read_path))
def extractAndAppend(filePath, n, instrum):
    intensity = []
    MZ = []
    charge = []
    title = []
    pepmass = []
    print("\n█████ Start reading information from " + filePath + " █████\n")
    for spectrum in mgf.read(filePath):
        # print("\n\n Spectrum \n\n\n", spectrum)
        params = spectrum.get('params')
        MZ.append(spectrum.get('m/z array'))
        intensity.append(spectrum.get("intensity array"))
        charge.append(params.get('charge'))
        title.append(params.get('title'))
        pepmass.append(params.get('pepmass'))
    print("\n█████ Finished reading information from " + filePath + " █████\n")
    resultList = random.sample(range(0, len(charge)), int(len(charge) * 0.2))
    index = sorted(resultList)
    # print(index)
    # write
    for i in range(0, len(index)):
        Mz = list(MZ[index[i]])
        Intensity = list(intensity[index[i]])
        # print(Mz)
        PepMass = list(pepmass[index[i]])
        with open("D" + str(n) + ".mgf", "a") as f1:
            f1.write("BEGIN IONS\n")
            f1.write("TITLE=" + title[index[i]] + "-MGF-instrumentation=" + instrum + "\n")
            f1.write("PEPMASS=" + str(PepMass[0]) + "\n")
            f1.write("CHARGE=" + str(charge[index[i]]) + "\n")
            for i1 in range(len(Mz)):
                f1.write(str(Mz[i1]) + " " + str(Intensity[i1]) + "\n")
            f1.write("END IONS\n")
def convert_mq_mracluster_mgf(mq_msms, mrcluster_clusters, mgf_file, output, px_accession, raw_name):
    if mq_msms is None or mrcluster_clusters is None or mgf_file is None:
        print_help()

    # Read the input spectra.
    input_spectra = mgf.read(mgf_file)
    spectra_list = list(input_spectra)
    print('Number of Spectra: ' + str(len(spectra_list)))

    # Read the msms.txt file. For now the peptides are stored in a dictionary where the key is
    # the scan number and the value is the peptide sequence. Be aware that one scan can be
    # associated with more than one peptide sequence.
    peptides = read_peptides(mq_msms)
    print('Number of Peptides: ' + str(len(peptides)))

    # Read the clusters: a map where the key is the scan and the value is the cluster the scan belongs to.
    clusters = read_clusters(mrcluster_clusters)
    print("Number of Clusters: " + str(len(clusters)))

    for scan in clusters:
        print('scan: ' + str(scan))
        for spectra in spectra_list:
            if spectra['params']['title'].endswith('scan=' + str(scan)):
                cluster_accession = clusters[scan]
                if scan not in peptides:
                    peptide_sequence = None
                else:
                    peptide_sequence = peptides[scan]
                charge = int(spectra['params']['charge'][0])
                spectra['params']['title'] = buid_usi_accession(cluster_accession, peptide_sequence,
                                                                scan, px_accession, raw_name, charge)
                mgf.write([spectra], output)
def read_mgf(path):
    """ returns (scan ID, time, charge, mz, mass estimate) """
    with mgf.read(path) as reader:
        for i in reader:
            scan = int(re.match(".* scan=([0-9]+)", i["params"]["title"])[1])
            time = i["params"]["rtinseconds"]
            chargelist = i["params"]["charge"]
            if len(chargelist) > 1:
                raise AssertionError("ChargeList length>1 unsupported")
            charge = int(chargelist[0])
            peptide_mz = i["params"]["pepmass"][0]
            peptide_intensity = i["params"]["pepmass"][1]
            peptide_mass_estimate = peptide_mz * charge - charge * PROTON_MASS
            fragments_mz = i["m/z array"]
            fragments_intensity = i["intensity array"]
            yield PeptideMeasurement(
                scan,
                time,
                charge,
                peptide_mz,
                peptide_intensity,
                peptide_mass_estimate,
                fragments_mz,
                fragments_intensity,
            )
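# Worked example of the neutral-mass estimate computed in read_mgf above. The proton mass value
# is an assumption here; the real PROTON_MASS constant is defined elsewhere in the source project.
PROTON_MASS = 1.007276
peptide_mz, charge = 500.0, 2
print(peptide_mz * charge - charge * PROTON_MASS)  # ~997.985 Da neutral mass estimate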
def create_ppk_matrix_stripe_serial(filter_func, shift, normalise, output_name):
    iokr_data_path = '/home/grimur/iokr/data'
    data_gnps = scipy.io.loadmat("/home/grimur/iokr/data/data_GNPS.mat")
    ms_path = '/home/grimur/iokr/data/SPEC'
    candidate_set = '/home/grimur/iokr/data/mibig/matched_mibig_gnps_2.0.mgf'
    candidate_set_size = 257

    iokrdata = data.IOKRDataServer(iokr_data_path)
    ker_size = len(iokrdata.spectra)

    kernel_matrix_peaks = numpy.zeros((candidate_set_size, ker_size))
    kernel_matrix_nloss = numpy.zeros_like(kernel_matrix_peaks)

    t0 = time.time()
    names = [x[0] for x in iokrdata.spectra]

    cnt = 0
    for i in mgf.read(candidate_set):
        i_ms = MSSpectrum(i)
        # active_jobs.append((i, p.apply_async(do_stripe, (i, names))))
        res = do_stripe(i_ms, names, filter_func, shift, normalise)
        for j_idx, values in enumerate(res):
            ij_peaks, ij_nloss = values
            kernel_matrix_peaks[cnt, j_idx] = ij_peaks
            kernel_matrix_nloss[cnt, j_idx] = ij_nloss
        cnt += 1
        print('done %s / %s, %s' % (cnt, candidate_set_size, time.time() - t0))

    numpy.save(output_name + '_test_peaks.npy', kernel_matrix_peaks)
    numpy.save(output_name + '_test_nloss.npy', kernel_matrix_nloss)
def annotate_mgf(mgf_input: str, mascot_input: str, mgf_output: str):
    """Annotate MGF file using Mascot XML results.

    annotate_mgf will annotate the MGF file using peptide sequences found in
    the Mascot XML results and write the resulting MGF file to mgf_output.

    Args:
        mgf_input (str): path to the MGF input file.
        mascot_input (str): path to the Mascot XML results.
        mgf_output (str): path to the MGF output file.
    """
    # Retrieve Mascot sequences.
    mascot_seq = extract_mascot_sequences(mascot_input)
    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            sequences = mascot_seq.loc[
                mascot_seq.title == spectrum['params']['title'], 'sequence'
            ].values
            # If multiple sequences are associated with a single spectrum, the
            # latter will be duplicated for each sequence.
            for seq in sequences:
                spectrum['params']['seq'] = seq
                mgf.write((spectrum,), mgf_output)
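# Hypothetical call sketch for annotate_mgf; the paths are placeholders, and
# extract_mascot_sequences is assumed to return a table with 'title' and 'sequence' columns,
# as implied by the .loc lookup above.
annotate_mgf('run01.mgf', 'run01_mascot.xml', 'run01_annotated.mgf')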
def format_mgf_deepnovo(mgf_input: str, mgf_output: str):
    """Format MGF file for use with DeepNovoV2.

    Necessary spectrum parameters will be reordered to comply with the
    DeepNovoV2 convention. Other parameters will be discarded. Empty spectra
    will be discarded.

    Args:
        mgf_input (str): path to the input MGF file.
        mgf_output (str): path to the output MGF file.
    """
    key_order = ['title', 'pepmass', 'charge', 'scans', 'rtinseconds']
    with mgf.read(mgf_input, read_charges=False) as reader:
        for spectrum in reader:
            # Check that the spectrum isn't empty.
            if spectrum['m/z array'].size:
                # Remove unnecessary parameters.
                to_remove = [c for c in spectrum['params'].keys()
                             if c not in key_order]
                for col in to_remove:
                    spectrum['params'].pop(col)
                # Append the current spectrum to the MGF output with the
                # correct parameter order.
                mgf.write((spectrum,), mgf_output, key_order=key_order)
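# Hypothetical call sketch for format_mgf_deepnovo (placeholder paths). The key_order argument
# of pyteomics' mgf.write, used above, is what enforces the DeepNovoV2 parameter order.
format_mgf_deepnovo('run01_annotated.mgf', 'run01_deepnovo.mgf')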
def read_mgf(filename):
    """
    Read all spectra from the given mgf file.

    Args:
        filename: The mgf filename from which to read the spectra.

    Returns:
        A generator of `Spectrum` objects containing each spectrum's
        information and its peaks (masses and intensities).
    """
    # test if the given file is an mgf file
    verify_extension(['.mgf'], filename)

    # get all query spectra
    for mgf_spectrum in mgf.read(filename):
        # create query spectrum
        identifier = mgf_spectrum['params']['title']
        precursor_mz = float(mgf_spectrum['params']['pepmass'][0])
        retention_time = float(mgf_spectrum['params']['rtinseconds'])
        if 'charge' in mgf_spectrum['params']:
            precursor_charge = int(mgf_spectrum['params']['charge'][0])
        else:
            precursor_charge = None

        read_spectrum = spectrum.Spectrum(identifier, precursor_mz,
                                          precursor_charge, retention_time)
        read_spectrum.set_peaks(mgf_spectrum['m/z array'],
                                mgf_spectrum['intensity array'])

        yield read_spectrum
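# Minimal usage sketch: read_mgf above is a generator, so spectra are produced lazily.
# 'sample.mgf' is a placeholder path.
n_spectra = sum(1 for _ in read_mgf('sample.mgf'))
print('{} spectra read'.format(n_spectra))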
def binSpectra(fi, binfunc, binparams):
    # This function reads an mgf file and assigns bins to the spectra
    # according to the given bin-function and bin-parameters.
    bins = {}
    with mgf.read(fi) as reader:
        for spectrum in reader:
            pmass = spectrum['params']['pepmass'][0]
            # here the bin-function is called
            bin_ = binfunc(spectrum, binparams)
            if bin_ not in bins:
                # raw_intensities, raw_masses, norm_masses
                bins[bin_] = [[], 0, []]
            bins[bin_][0].extend(spectrum['intensity array'])
            # to save space we only store the normalised masses
            # bins[bin_][1].extend(spectrum['m/z array'])
            bins[bin_][1] += 1
            bins[bin_][2].extend(spectrum['m/z array'] - pmass)
    # sort the peaks in each bin by mass, then intensity
    for bin_ in bins:
        nSpectra = bins[bin_][1]
        bins[bin_] = list(reversed(map(list, zip(*sorted(zip(bins[bin_][2], bins[bin_][0]))))))
        bins[bin_].insert(1, nSpectra)
    return bins
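# A hypothetical bin function with the shape binSpectra expects (not part of the original code):
# it takes the spectrum dict and the binparams and returns a hashable bin label,
# here a precursor-mass window index.
def pepmassWindowBinFunc(spectrum, binparams):
    winsize = binparams.get('winsize', 100)
    return int(spectrum['params']['pepmass'][0] // winsize)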
def getMassHistogram(fi, binsize=50):
    masses = []
    with mgf.read(fi) as reader:
        for spectrum in reader:
            pmass = spectrum['params']['pepmass'][0]
            masses.extend(spectrum['m/z array'] - pmass)

    # the histogram of the data with histtype='step'
    fig = plt.figure()
    plot = fig.add_subplot(111)
    n, bins, patches = plot.hist(masses, binsize, normed=1, histtype='stepfilled')
    plt.setp(patches, 'facecolor', 'b', 'alpha', 0.75)
    # add a line showing the expected distribution
    y = matplotlib.mlab.normpdf(bins, np.mean(masses), np.std(masses))
    l = plot.plot(bins, y, 'r--', linewidth=1.5)
    fig.tight_layout()
    fig.savefig(os.path.basename(fi) + '.massHist1.png', dpi=300)
    fig.savefig(os.path.basename(fi) + '.massHist1.svg', dpi=300)
    plt.close(fig)

    fig = plt.figure()
    plot = fig.add_subplot(111)
    n, bins, patches = plot.hist(masses, binsize, normed=1, histtype='bar')
    # add a line showing the expected distribution
    y = matplotlib.mlab.normpdf(bins, np.mean(masses), np.std(masses))
    l = plot.plot(bins, y, 'r--', linewidth=1.5)
    fig.tight_layout()
    fig.savefig(os.path.basename(fi) + '.massHist2.png', dpi=300)
    fig.savefig(os.path.basename(fi) + '.massHist2.svg', dpi=300)
    plt.close(fig)
def convert(args, out=sys.stdout):
    """Remaps the nodes from a network analysis (sorted by pepmass) to the
    same order as the spectral library."""
    with mgf.read(args.lib_mgf) as reader:
        spectra = list(sorted((sp['params']['pepmass'][0], oidx)
                              for oidx, sp in enumerate(reader)))
    node_fmt = "{0}l{1:010d}".format
    with open(args.edges_tsv) as f:
        for line in f:
            if line.startswith('#'):
                out.write(line)
                continue
            tokens = line.split('\t')
            s_mz, s_idx = tokens[0].split('-')
            s_omz, s_oidx = spectra[int(s_idx)]
            check_mz(s_mz, s_omz, tokens[0], s_oidx)
            t_mz, t_idx = tokens[1].split('-')
            t_omz, t_oidx = spectra[int(t_idx)]
            check_mz(t_mz, t_omz, tokens[1], t_oidx)
            out.write('\t'.join([
                node_fmt(s_mz, s_oidx),
                node_fmt(t_mz, t_oidx),
                'll',
            ] + tokens[3:]))
def write_to_csv(projectid, mgf_file, data_type):
    filename = os.path.basename(mgf_file)
    spec_file_name = mgf_file[:-4] + "_spec.csv"
    spec_file = open(spec_file_name, "w")
    spec_writer = csv.writer(spec_file, lineterminator='\n')
    spec_writer.writerow(['spectrumTitle', 'precursorMz', 'precursorIntens',
                          'charge', 'peaklistMz', 'peaklistIntens'])
    if data_type == "peak_psm":
        psm_file_name = mgf_file[:-4] + "_psm.csv"
        psm_file = open(psm_file_name, "w")
        psm_writer = csv.writer(psm_file)
        psm_writer.writerow(['spectrumTitle', 'sequence', 'modifications'])

    spectra_list = mgf.read(mgf_file)
    print("Handling the data in %s" % (mgf_file))
    for index, spectrum in enumerate(spectra_list, start=1):
        # enumerate's default start is zero
        (spec_row, psm_row) = get_row(projectid, filename, index, spectrum, data_type)
        spec_writer.writerow(spec_row)
        if data_type == "peak_psm":
            psm_writer.writerow(psm_row)
    print("The data has been written to the csv file.")
def reader(path, flag):
    intensity = []
    MZ = []
    charge = []
    title = []
    pepmass = []
    print("\n█████ Start reading information from " + path + " █████\n")
    for spectrum in mgf.read(path):
        # print("\n\n Spectrum \n\n\n", spectrum)
        params = spectrum.get('params')
        MZ.append(spectrum.get('m/z array'))
        intensity.append(spectrum.get("intensity array"))
        charge.append(params.get('charge'))
        title.append(params.get('title'))
        pepmass.append(params.get('pepmass'))
    print("\n█████ Finished reading information from " + path + " █████\n")
    # resultList = random.sample(range(0, len(charge)), int(len(charge) * 0.2))
    # index = sorted(resultList)
    if flag == 1:
        labels = [1 for i in range(0, len(handle(title)))]
    else:
        labels = [2 for i in range(0, len(handle(title)))]
    return handle(title), labels
def mgf_library_upload(fileName):
    libMGF = mgf.read(fileName)
    smf.print_milestone('Enter library dictionary upload: ')
    lib = {}
    id = 0
    for spec in libMGF:
        id += 1
        key = (spec['params']['pepmass'][0], spec['params']['seq'])
        charge = int(re.sub('[+-]', '', str(spec['params']['charge'][0])))
        name = spec['params']['title']
        if 'protein' in spec['params']:
            protein = spec['params']['protein']
        else:
            protein = ''
        if 'DECOY' in name:
            decoy = 1
        else:
            decoy = 0
        mz = spec['m/z array']
        intensity = spec['intensity array']
        intensity = [x**0.5 for x in intensity]
        keyList = [id for x in mz]
        peaks = list(tuple(zip(mz, intensity, keyList)))
        peaks.sort(key=lambda x: x[1], reverse=True)
        if len(peaks) > 10:
            peaks = peaks[:10]
        peaks.sort(key=lambda x: x[0])
        tempDict = {
            'PrecursorCharge': charge,
            'transition_group_id': name,
            'ProteinName': protein,
            'Peaks': peaks,
            'ID': id,
            'Decoy': decoy,
        }
        lib[key] = tempDict
    return lib
def merge_mgf_files(ms2_file, ms3_file, mz_cutoff):
    ms2_count = 0
    ms3_count = 0
    current_count = 0
    merged_count = 0

    # Preload the files into memory:
    #   ms2 - so we have a total spectra count for the progress bar
    #   ms3 - so we don't have to read it in repeatedly per ms2 spectrum
    merged_mgf = []
    ms2_spectrum_list = []
    ms3_spectrum_list = []

    print("Reading MS2 file: " + ms2_file)
    with mgf.read(ms2_file) as ms2_reader:
        for ms2_temp in ms2_reader:
            ms2_spectrum_list.append(ms2_temp)
            ms2_count += 1

    print("Reading MS3 file: " + ms3_file)
    with mgf.read(ms3_file) as ms3_reader:
        for ms3_temp in ms3_reader:
            ms3_spectrum_list.append(ms3_temp)
            ms3_count += 1

    # Loop through all MS2/MS3 spectra looking for fuzzy matches.
    for ms2_spectrum in ms2_spectrum_list:
        for ms3_index, ms3_spectrum in enumerate(ms3_spectrum_list):
            if compare_spectrums_with_fuzzy_rt(ms2_spectrum, ms3_spectrum):
                merged_xy = merge_xy_arrays(ms2_spectrum, ms3_spectrum, mz_cutoff)
                ms2_spectrum['m/z array'] = merged_xy[0]
                ms2_spectrum['intensity array'] = merged_xy[1]
                merged_count += 1
                # Remove the element we just found from the list to avoid dupes and save time.
                del ms3_spectrum_list[ms3_index]
                break
        merged_mgf.append(ms2_spectrum)  # add whether or not it was merged
        current_count += 1
        write_progress_bar(current_count, ms2_count)

    return {
        "merged_mgf": merged_mgf,
        "ms2_count": ms2_count,
        "ms3_count": ms3_count,
        "merged_count": merged_count,
    }
def readmgf(fn):
    file = open(fn, "r")
    data = mgf.read(file, convert_arrays=1, read_charges=False,
                    dtype='float32', use_index=False)
    codes = parse_spectra(data)
    file.close()
    return codes
class TestMgf:
    data = pd.DataFrame(mgf.read("../data/raw/example.mgf"))

    def test_one(self):
        assert self.data.count()[0] == 5

    def test_two(self):
        assert self.data.iloc[0]['params']['title'] == 'scan=986 profile data'
def read_mgf(file):
    spectra = mgf.read(file)
    spectrum = next(spectra)
    # create parameters: mz = m/z array, params = parameters, amp = intensity array
    mz = spectrum['m/z array']
    params = spectrum['params']
    amp = spectrum['intensity array']
    return (mz, params, amp)
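# Minimal usage sketch for read_mgf above ('example.mgf' is a placeholder path); it returns
# only the first spectrum's peak arrays and params.
mz, params, amp = read_mgf('example.mgf')
print(params.get('title'), len(mz), len(amp))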
def load_file(self, mgf_path, csv_path):
    print('Starting to load file data...')
    info = pd.read_csv(csv_path, header=None)
    self.spectrum1 = info[0].tolist()
    self.spectrum2 = info[1].tolist()
    self.label = info[2].tolist()
    for mgf in read(mgf_path, convert_arrays=1):
        self.MGF[mgf.get('params').get('title')] = mgf
    print('Finished loading data...')
def extractSpectraFast(x):
    rows = 0
    with mgf.read(x) as spectra:
        for spectrum in spectra:
            rows = rows + len(spectrum['m/z array'])
    # first argument is the row count, second argument is the column count
    n = np.empty((rows, 2))
    with mgf.read(x) as spectra:
        i = 0
        for spectrum in spectra:
            for m, intensity in zip(spectrum['m/z array'], spectrum['intensity array']):
                n[i, 0] = m
                n[i, 1] = intensity
                i = i + 1
    return n
def extractSpectraFastForR(x):
    i = 1
    with mgf.read(x) as spectra, open("/home/tobiass/df.csv", "wt") as csvfile:
        writr = csv.writer(csvfile)
        for spectrum in spectra:
            for m, intensity in zip(spectrum['m/z array'], spectrum['intensity array']):
                writr.writerow((m, intensity, i))
            i = i + 1
def mgf_library_upload_quant(fileName, scanDict, digDict, aaDict, maxPeaks):
    # the mgf file is read in using the pyteomics mgf module
    libMGF = mgf.read(fileName)

    # the return value is initialized
    lib = defaultdict(list)
    keyList = sorted(list(scanDict.keys()))

    # each spectrum in the mgf file
    for spec in libMGF:
        seq = spec['params']['seq']
        precMz = spec['params']['pepmass'][0]
        key = (round(precMz, 2), seq)
        if key not in scanDict:
            continue

        # Decimal values are replaced with numeric placeholders to be included in the analysis.
        sequence = re.sub(r'\+\d+\.\d+', lambda m: digDict.get(m.group()), seq)

        # the peaks of the library file are initialized
        mz = list(spec['m/z array'])
        intensity = [x for x in list(spec['intensity array'])]
        z = spec['params']['charge'][0]

        # The y-ion m/z value for each fragment of the peptide is calculated. If it is in the
        # library, it and its intensity are stored in a list.
        # NOTE: y-ions are singled out because they should have at least one lysine or arginine,
        # so they will have a heavy counterpart that can show up. b-ions don't have that guarantee.
        fragList = []
        for x in range(1, len(sequence) - 1):
            fragseq = sequence[x:]
            lightfragmz = mass.fast_mass(
                sequence=sequence[x:], ion_type='y', charge=1,
                aa_mass=aaDict)  # Do I need to use different possible charges?
            i = smf.approx_list(lightfragmz, mz)
            if i == -1:
                continue
            fragList.append((intensity[i], lightfragmz, fragseq))

        # y-ion peaks are sorted by intensity, and lower-intensity peaks are filtered out.
        fragList.sort(reverse=True)
        if maxPeaks != 0 and len(fragList) >= maxPeaks:
            fragList = fragList[:maxPeaks]

        # The heavy counterpart m/z is calculated. Light and heavy pairs are additionally tagged
        # by their intensity rank and included in the final output.
        peaks = []
        for i in range(len(fragList)):
            fragMz = fragList[i][1]
            fragInt = fragList[i][0]
            peaks.append((fragMz, fragInt, (0, i, seq)))
            peaks.append((smf.calculate_heavy_mz(fragList[i][2], fragMz, 1), fragInt, (1, i, seq)))
        peaks.sort(key=lambda x: x[0])
        lib[scanDict[key]] += peaks
    return lib
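# Small illustration of the y-ion calculation used above: pyteomics' mass.fast_mass returns the
# singly protonated y-ion m/z for a C-terminal fragment, here with the default monoisotopic
# residue masses instead of the project-specific aaDict.
from pyteomics import mass
print(mass.fast_mass('TIDE', ion_type='y', charge=1))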
def plotHeatmapOld(fi, limit=1000):
    # X, Y, Z = [], [], []
    grid_d = {}
    xmin, xmax, ymin, ymax = None, None, None, None
    processedSpectra = 0
    with mgf.read(fi) as reader:
        for spectrum in reader:
            if processedSpectra > limit:
                break
            processedSpectra += 1
            x = int(spectrum['params']['pepmass'][0] * 100000)
            xmin, xmax = (min(x, xmin), max(x, xmax)) if xmin is not None else (x, x)
            for y, z in it.izip(spectrum['m/z array'], spectrum['intensity array']):
                y = int(y + 0.5)
                ymin, ymax = (min(y, ymin), max(y, ymax)) if ymin is not None else (y, y)
                grid_d[(x, y)] = z
            # X.extend([spectrum['params']['pepmass'][0] for i in spectrum['intensity array']])
            # Y.extend(spectrum['m/z array'])
            # Z.append(spectrum['intensity array'])
            # Z.extend(spectrum['intensity array'])

    grid = []
    for x in xrange(xmin, xmax + 1):
        for y in xrange(ymin, ymax + 1):
            if (x, y) in grid_d:
                grid.append(grid_d[(x, y)])
                del grid_d[(x, y)]
            else:
                grid.append(0.0)

    nrows, ncols = xmax - xmin + 1, ymax - ymin + 1
    grid2 = np.array(grid).reshape((nrows, ncols))
    grid = grid2

    # fig = plt.figure()
    # plot = fig.add_subplot(111)
    plt.imshow(grid, extent=(xmin, xmax, ymin, ymax),
               interpolation='nearest', cmap=matplotlib.cm.seismic)
    # X = np.array(X)
    # Y = np.array(Y)
    # Z = np.matrix(Z)
    # range_ = min(Z), max(Z)
    # Z = matplotlib.cm.rainbow(map(lambda x: x / range_[1], Z))
    # plot.scatter(X, Y, color=Z)
    # pcolormesh(X, Y, Z)
    plt.tight_layout()
    plt.savefig(os.path.basename(fi) + '.colormap.png', dpi=300)
    plt.savefig(os.path.basename(fi) + '.colormap.svg', dpi=300)
    plt.close()
def load_recalibrate(self):
    fc = calculate_Delta_by_ppm(self.ppm)
    tmt_mass = calculate_tag_tmt10()
    with mgf.read(self.path) as spectra:
        for spectrum in spectra:
            ms = MasterSpectrum()
            params = spectrum['params']
            for mass, intensity in zip(spectrum['m/z array'], spectrum['intensity array']):
                ms.add(Peak(mass, intensity, fc))
            peak = Peak(tmt_mass, 0.5, fc)
            if peak.key() not in ms.spectrum[0]:
                recalibrate = False
            else:
                idx, bin_to_ack, a, b = ms.binary(peak, 0, len(ms.spectrum[0][peak.key()]) - 1, 0)
                if idx == -1:
                    recalibrate = False
                else:
                    recalibrate = True
                    recalibration_mass = ms.spectrum[0][peak.key()][idx].mz
                    diff = tmt_mass - recalibration_mass
                    print(params['title'])
                    print("original={0}\tdiff={1}".format(recalibration_mass, diff))
            mass_list = []
            int_list = []
            if recalibrate:
                ppm_shift = calculate_ppm_shift(diff, tmt_mass)
            for key in ms.spectrum[0].keys():
                for mp in ms.spectrum[0][key]:
                    if recalibrate:
                        if self.type == 'ppm':
                            diff = calculate_da_shift(mp.mz, ppm_shift)
                            mass_list.append(mp.mz + diff)
                        elif self.type == 'absolute':
                            diff = diff
                            mass_list.append(mp.mz + diff)
                        else:
                            print(self.type)
                            raise ValueError("what did you dooooo")
                    else:
                        mass_list.append(mp.mz)
                    int_list.append(mp.intensity)
            print("len is:\t{0}".format(len(mass_list)))
            mgf.write(spectra=[{'m/z array': mass_list,
                                'intensity array': int_list,
                                'params': params}],
                      output=self.file_out)
def calculate_dsmapper_time(spectra_mgf_file1, spectra_mgf_file2):
    score_list = []
    # model = "../SpectraPairsData/080802_20_1000_NM500R_model.pkl"
    model = "./data/080802_20_1000_NM500R_model.pkl"
    tmp_time_01 = time.perf_counter()
    net = torch.load(model)
    tmp_time_02 = time.perf_counter()
    print("Model loading time: {}".format(tmp_time_02 - tmp_time_01))

    # 500 reference spectra
    # reference_spectra = read("./0715_50_rf_spectra.mgf", convert_arrays=1)
    reference_spectra = read("../SpectraPairsData/0722_500_rf_spectra.mgf", convert_arrays=1)
    # reference_spectra = read("./data/0722_500_rf_spectra.mgf", convert_arrays=1)
    reference_intensity = np.array([
        bin_spectrum(r.get('m/z array'), r.get('intensity array'))
        for r in reference_spectra
    ])
    spectra_pairs_num = more_itertools.ilen(read(spectra_mgf_file1, convert_arrays=1))
    tmp_time_03 = time.perf_counter()
    print("Data preparation time: {}".format(tmp_time_03 - tmp_time_02))

    embedded_01 = embedding_dataset(net, spectra_mgf_file1, reference_intensity, spectra_pairs_num)
    embedded_02 = embedding_dataset(net, spectra_mgf_file2, reference_intensity, spectra_pairs_num)
    # embedded_01 = embedded_01.reshape(embedded_01.shape[0], 1, embedded_01.shape[1])
    # embedded_02 = embedded_02.reshape(embedded_02.shape[0], 1, embedded_02.shape[1])
    time01 = time.perf_counter()
    print("Total encoding and embedding time: {}".format(time01 - tmp_time_03))

    for i in range(embedded_01.shape[0]):
        score = np.linalg.norm(embedded_01[i] - embedded_02[i])
        score_list.append(score)
    # np.savetxt("./data/091801_test_use_time_dsmapper.txt", score_list)
    time02 = time.perf_counter()
    print("calc_EU use time: {}".format(time02 - time01))
def readSpectrum(mgffile, scanindex):
    msms1 = []
    with mgf.read(mgffile) as allspectra:
        for spectrum in allspectra:
            n = int(re.sub(r"\D+", "", spectrum['params']['title']))
            if n == scanindex:
                mz = spectrum['m/z array']
                inty = spectrum['intensity array']
                for i in range(len(mz)):
                    msms1.append((mz[i], inty[i]))
    return pd.DataFrame(msms1, columns=['Mass', 'Intensity'])
def getBinMembers(fi, binfunc, binparams):
    bins = {}
    with mgf.read(fi) as reader:
        for spectrum in reader:
            pmass = spectrum['params']['pepmass'][0]
            bin_ = binfunc(spectrum, binparams)
            if bin_ not in bins:
                bins[bin_] = []
            bins[bin_].append(spectrum['params']['scans'])
    return bins
def plotDensities(fi, precMassBinSize=100):
    binf = maxMS2IntensityBinFunc
    massBins = {}
    spectrum_bin_map = {}
    with mgf.read(fi) as reader:
        spectra = sorted(((spectrum['params']['pepmass'][0],
                           spectrum['params']['title'],
                           binf(spectrum, params={'winsize': precMassBinSize}))
                          for spectrum in reader),
                         key=lambda x: (x[2], x[0]))
    for i, spectrum in enumerate(spectra):
        spectrum_bin_map[i] = spectrum[2]

    grid = n_subplots(len(set(spectrum_bin_map.values())))
    f, axes = plt.subplots(grid[0], grid[1], figsize=(16, 16), sharey=True)
    sns.despine(left=True)
    sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.0})
    pal = sns.color_palette("Reds", n_colors=101)

    with mgf.read(fi) as reader:
        for i, spectrum in enumerate(reader):
            massBins[spectrum_bin_map[i]] = massBins.get(spectrum_bin_map[i], []) + [spectrum]

    row, col = 0, 0
    for mbin in sorted(massBins):
        print row, col, mbin, len(massBins[mbin])
        maxIntensity = max(max(spectrum['intensity array']) for spectrum in massBins[mbin])
        spectra = sorted(((max(spectrum['intensity array']),
                           spectrum['params']['pepmass'][0], i)
                          for i, spectrum in enumerate(massBins[mbin])),
                         key=lambda x: x[0])
        for intensity, precMass, i in spectra:
            dp = sns.distplot(massBins[mbin][i]['m/z array'] - precMass,
                              hist=False,
                              color=pal[int(intensity / maxIntensity * 100 + 0.5)],
                              ax=axes[row, col])
        axes[row, col].set_ylim(0, 0.004)
        if col == grid[1] - 1:
            row += 1
            col = 0
        else:
            col += 1

    plt.tight_layout()
    plt.savefig("test.png", dpi=300)
def plain_parse(mgf_read_path, mgf_txt_write_path):
    this_dir = os.path.dirname(os.path.realpath(__file__))
    if os.path.isfile(mgf_txt_write_path):
        return "mgf_txt_write_path is already a file"
    with open(mgf_txt_write_path, 'wb') as mgf_csv:
        with mgf.read(mgf_read_path) as mgf_reader:
            csv_writer = csv.writer(mgf_csv, delimiter='\t')
            csv_writer.writerow(['filename', 'scan', 'charge', 'rt', 'ms1 intensity'])
            for spectrum in mgf_reader:
                scans = spectrum['params']['scans']
                charge = re.sub(r'[^\d.]+', '', str(spectrum['params']['charge']))
                rt = spectrum['params']['rtinseconds']
                ms1_intensity = spectrum['params']['pepmass'][1]
                csv_writer.writerow([os.path.basename(mgf_read_path), scans, charge, rt, ms1_intensity])
    device.ylabel("Intensity")
    device.bar(spectrum["m/z array"], spectrum["intensity array"],
               width=0.5, linewidth=1, edgecolor=color)
    return


def plotSingleSpectrum(filename, spectrum):
    pylab.figure()
    createSpectrumFigure(spectrum, pylab)
    pylab.savefig(filename)
    return


## main
if len(sys.argv) == 2:
    ## download file
    filename, header = urllib.urlretrieve(sys.argv[1])
    spectra = mgf.read(filename)
    ## we only support the first spectrum now
    spectrum = next(spectra)
    tmpdir = tempfile.mkdtemp()
    filename = os.path.join(tmpdir, "output.svg")
    plotSingleSpectrum(filename, spectrum)
    ## write to stdout
    with open(filename, "r") as fin:
        print(fin.read())
else:
    print("Usage:\n\t" + sys.argv[0] + " URL")
@author: ilya
"""
from pyteomics import fasta, mgf, parser
import pylab

fasta_file = '/home/ilya/src/pyteomics/RhoEcoli.fasta'
mgf_file = '/home/ilya/src/pyteomics/MultiConsensus.mgf'

peptides = set()
with open(fasta_file) as fi:
    for description, sequence in fasta.read(fi):
        new_peptides = parser.cleave(sequence, parser.expasy_rules['trypsin'])
        peptides.update(new_peptides)
print "UNIQUE PEPTIDES"
print peptides

with open(mgf_file) as fi:
    for spectrum in mgf.read(fi):
        pylab.figure()
        pylab.xlabel('m/z, Th')
        pylab.ylabel('Intensity, rel.units')
        pylab.bar(spectrum['m/z array'], spectrum['intensity array'],
                  width=0.1, linewidth=2, edgecolor='black')
        pylab.show()
        inp = raw_input("Show more?")
        if inp != "yes":
            break
print "DONE!"
def plotHeatmap(fi, nSpectra=4000, fragBinsize=10, precursorBinsize=5):
    binf = maxMS2IntensityBinFunc
    # filter spectra so that at most 4000 spectra with log10(maxMS2Intensity) == 2 are used
    # spectra are sorted by log10(maxMS2Intensity), precursor mass
    with mgf.read(fi) as reader:
        spectra = sorted([(spectrum['params']['pepmass'][0],
                           spectrum['params']['title'],
                           binf(spectrum))
                          for spectrum in reader],
                         key=lambda x: (x[2], x[0]))
    spectra = [spectrum for spectrum in spectra if spectrum[2] == 2][:nSpectra]
    for spectrum in spectra:
        print spectrum

    # build dict spectrum-id => index (might have to be precursor mass)
    usedSpectra = dict([(spectrum[1], i) for i, spectrum in enumerate(spectra)])

    # extract spectrum masses/intensities from file according to usedSpectra list
    grid_d = {}
    X, Y = [], []
    with mgf.read(fi) as reader:
        for spectrum in reader:
            if spectrum['params']['title'] in usedSpectra:
                # find precursor bin (y-axis)
                binPrecursor = usedSpectra[spectrum['params']['title']] // precursorBinsize
                maxIntensity, maxIntensityFM = 0, 0
                for fragMass, fragIntensity in it.izip(spectrum['m/z array'], spectrum['intensity array']):
                    # find fragment mass bin (x-axis) for each fragment
                    binFrag = int(fragMass // fragBinsize)
                    if fragIntensity > maxIntensity:
                        maxIntensity = fragIntensity
                        maxIntensityFM = binFrag
                        maxIntensityBP = binPrecursor
                    # key = (binFrag, binPrecursor)
                    key = (binPrecursor, binFrag)
                    # grid_d holds 2D bins (precursor mass, fragment mass)
                    if key not in grid_d:
                        grid_d[key] = []
                    grid_d[key].append(fragIntensity)
                for i in xrange(int(maxIntensity + 0.5)):
                    X.append(int(maxIntensityFM + 0.5))
                    Y.append(int(maxIntensityBP + 0.5))

    with sns.axes_style("white"):
        # f, axes = plt.subplots(1, 1, figsize=(16,16), sharey=True)
        # fig = plt.figure()
        # sp = fig.add_subplot(1,1,1)
        sns.despine(left=True)
        sns.set_context("notebook", font_scale=1.5, rc={"lines.linewidth": 1.0})
        pal = sns.color_palette("Reds", n_colors=101)
        pl = sns.jointplot(x=np.array(X), y=np.array(Y), kind="kde", color="g")
        pl.savefig('hex.png', dpi=300)
        # plt.tight_layout()
        # plt.savefig("hex.png", dpi=300)
    del X
    del Y

    def mean(L):
        return sum(L) / len(L)

    nrows, ncols = 2000 // precursorBinsize, 2000 // fragBinsize
    for key in sorted(grid_d):
        print key, grid_d[key]

    # build colormesh grid - each cell is either 0.0 or the mean intensity
    grid = []
    for x in xrange(ncols):
        for y in xrange(nrows):
            if (x, y) in grid_d:
                grid.append(mean(grid_d[(x, y)]))
                del grid_d[(x, y)]
            else:
                grid.append(0.0)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.pcolormesh(np.array(grid).reshape((nrows, ncols)),
                  cmap=matplotlib.cm.seismic, vmin=min(grid), vmax=max(grid))
    """
    grid = []
    for y in xrange(nrows):
        row = []
        for x in xrange(ncols):
            if (x, y) in grid_d:
                row.append(mean(grid_d[(x, y)]))
                del grid_d[(x, y)]
            else:
                row.append(0.0)
        grid.append(row)
    grid = grid[::-1]
    grid = [item for sublist in grid for item in sublist]
    grid = np.array(grid).reshape((nrows, ncols))
    plt.imshow(grid, extent=(0, ncols, 0, nrows), interpolation='nearest',
               cmap=matplotlib.cm.seismic)
    """
    plt.tight_layout()
    plt.savefig(os.path.basename(fi) + '.colormap.png', dpi=300)
    plt.savefig(os.path.basename(fi) + '.colormap.svg', dpi=300)
    plt.close()
def ingest_mgf(input_filename):
    """Ingest an mgf file given its name and return a dataframe of the file."""
    with mgf.read(input_filename) as reader:
        auxiliary.print_tree(next(reader))
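# Hedged sketch of how ingest_mgf's stated goal (a dataframe of the file) could be met,
# mirroring the TestMgf snippet earlier in this collection; the path is the same test file.
import pandas as pd
from pyteomics import mgf
df = pd.DataFrame(mgf.read('tests/test.mgf'))
print(df.columns.tolist())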