def mzxml_import(file_path):
    """Read centroided mzXML data into a peak table.

    Every spectrum of ``file_path`` is visited; peaks of MS1 spectra whose
    intensity is at least ``config.intensity_cutoff`` are kept.

    Args:
        file_path: Path of the mzXML file, handed to ``mzxml.MzXML``.

    Returns:
        A ``pandas.DataFrame`` with the columns ``scan`` (position of the
        spectrum in the file), ``rt`` (``retentionTime`` rounded to two
        decimals), ``mz``, ``drift`` (always ``None``) and ``intensity``
        (converted with ``int``), or ``None`` when no peak passes the cutoff.
    """
    headers = ["scan", "rt", "mz", "drift", "intensity"]
    input_data = []
    intensity_cutoff = config.intensity_cutoff

    # The context manager closes the underlying file once reading is done.
    with mzxml.MzXML(file_path) as reader:
        for index, spectrum in enumerate(reader):
            mz_values = spectrum["m/z array"]
            intensities = spectrum["intensity array"]
            if len(mz_values) != len(intensities):
                print(
                    "ERROR: mzXML import; m/z and intensity arrays different lengths"
                )
            if spectrum["msLevel"] != 1:
                continue
            rt = round(spectrum["retentionTime"], 2)
            # zip() pairs values up to the shorter array, so a spectrum that
            # triggered the length error above can no longer raise IndexError.
            input_data.extend(
                [index, rt, mz, None, int(intensity)]
                for mz, intensity in zip(mz_values, intensities)
                if intensity >= intensity_cutoff
            )

    if not input_data:
        # f-string instead of "+" so path-like objects are accepted as well.
        print(f"No mass peaks found for {file_path}")
        return None

    mzxml_dataframe = pd.DataFrame.from_records(input_data, columns=headers)
    print("Completed mzXML import")
    return mzxml_dataframe
def read_data(file):
    """
    Open an mzXML file with pyteomics.mzxml and return the reader object.

    A confirmation line is printed once the reader has been created.
    """
    reader = mzxml.MzXML(file)
    confirmation = 'has been accepted'
    print(str(file), confirmation)
    return reader
def getRT(runs, idTxt):
    """Infer a retention time for every identified peptide/charge pair per run.

    Input
      runs:  iterable of mzXML file paths
      idTxt: path of the ID.txt file containing all identified PSMs
             (semicolon-delimited, first row skipped)

    For every run, the PSMs whose "Outfile" contains the run name are grouped
    by their peptide-charge key. The RT of a key is the precursor-intensity
    weighted mean of the precursor RTs returned by ``getPrecursorPeak``.

    Returns
      The result of ``reformatRtTable`` applied to a DataFrame with the
      columns "key", "run", "RT" and "nPSMs".
    """
    # Parameter(s)
    params = {"isolation_window": 1}  # isolation window size 1= +/-0.5

    # Read ID.txt files to extract PSM information
    print(" Read ID.txt file: to extract PSM information")
    psms = pd.read_csv(idTxt, skiprows=1, sep=";")  # Note that ID.txt file is delimited by semicolon
    psms = psms[["Peptide", "Outfile", "XCorr"]].drop_duplicates()
    # The charge is the second-to-last dot-separated field of the outfile name
    psms["charge"] = [
        outfile.split("/")[-1].split(".")[-2] for outfile in psms["Outfile"]
    ]
    psms["key"] = psms["Peptide"] + "_" + psms["charge"]
    print(" Done ...\n")

    # RT extraction/assignment for each mzXML file
    res = []
    for run in runs:
        runName = os.path.basename(run).split(".")[0]

        # Read a mzXML file and extract PSMs corresponding to the mzXML file.
        # The reader is closed as soon as the run has been processed.
        with mzxml.MzXML(run) as reader:
            ms2ToSurvey = getMs2ToSurvey(reader)
            # regex=False: the run name is a literal substring, not a pattern
            subPsms = psms[psms["Outfile"].str.contains(runName, regex=False)]

            # Unique key is peptide-charge pair
            print(
                " RT of every identified peptide in {} is being inferred and assigned"
                .format(runName))
            # One pass over the PSMs instead of re-filtering the table per key.
            # Rows whose key is NaN are not grouped and are therefore skipped.
            keyGroups = subPsms.groupby("key", sort=False)
            progress = progressBar(keyGroups.ngroups)
            for key, keyPsms in keyGroups:
                progress.increment()
                rtArray = np.array([])
                intArray = np.array([])
                for outfile in keyPsms["Outfile"]:
                    # Outfile base names are expected to have exactly five
                    # dot-separated fields; the second one is the scan number
                    [_, psmScanNum, _, _, _] = os.path.basename(outfile).split(".")
                    psmScanNum = int(psmScanNum)
                    surveyScanNum = ms2ToSurvey[psmScanNum]
                    _, precIntensity, precRt = getPrecursorPeak(
                        reader, psmScanNum, surveyScanNum, params)
                    rtArray = np.append(rtArray, precRt)
                    intArray = np.append(intArray, precIntensity)
                rt = sum(rtArray * intArray) / sum(intArray)  # Unit of minute
                res.append([key, runName, rt, len(rtArray)])
        print(" Done ...\n")

    res = pd.DataFrame(res, columns=["key", "run", "RT", "nPSMs"])
    res = reformatRtTable(res)
    return res
def mzxml_to_pandas_df(filename):
    """Read an mzXML file and return its spectra as one pandas.DataFrame.

    Each spectrum yielded by ``mzxml.MzXML`` is turned into a DataFrame and
    all of them are concatenated. ``df_to_numeric`` is then called on the
    result and the "intensity array" column is cast to ``numpy.float64``.

    Reading stops only when the reader is exhausted. Any other error (for
    example a spectrum that cannot be converted to a DataFrame) propagates
    to the caller instead of silently truncating the data.

    Raises:
        ValueError: if the file yields no spectra.
    """
    # The context manager closes the underlying file once reading is done.
    with mzxml.MzXML(filename) as reader:
        print("Reading:", filename)
        slices = [pd.DataFrame(spectrum) for spectrum in reader]
    if not slices:
        # pd.concat would raise ValueError here as well; this message names the file.
        raise ValueError(f"No spectra could be read from {filename}")
    df = pd.concat(slices)
    df_to_numeric(df)
    df["intensity array"] = df["intensity array"].astype(np.float64)
    return df
def mzxml_to_pandas_df(filename):
    '''
    Reads mzXML file and returns a pandas.DataFrame.

    Each spectrum yielded by ``mzxml.MzXML`` is turned into a DataFrame and
    all of them are concatenated. Only the columns 'retentionTime',
    'm/z array' and 'intensity array' are kept, and ``df_to_numeric`` is
    called on the result before it is returned.

    Reading stops only when the reader is exhausted. Any other error
    propagates to the caller instead of silently truncating the data.
    A ValueError is raised if the file yields no spectra.
    '''
    cols = ['retentionTime', 'm/z array', 'intensity array']
    # The context manager closes the underlying file once reading is done.
    with mzxml.MzXML(filename) as reader:
        slices = [pd.DataFrame(spectrum) for spectrum in reader]
    if not slices:
        # pd.concat would raise ValueError here as well; this message names the file.
        raise ValueError(f"No spectra could be read from {filename}")
    df = pd.concat(slices)[cols]
    df_to_numeric(df)
    return df
def mzxml_to_df(fn):
    '''
    Load all scans of an mzXML file into one pandas.DataFrame.

    Every scan yielded by ``mzxml.MzXML`` becomes a small frame. The frames
    are concatenated, three columns are cast to fixed dtypes, and the
    columns are renamed to 'scan_id', 'ms_level', 'polarity',
    'scan_time_min', 'mz' and 'intensity'. The index of the returned frame
    is reset.
    '''
    source_cols = [
        'num', 'msLevel', 'polarity', 'retentionTime', 'm/z array',
        'intensity array'
    ]
    new_names = {
        'num': 'scan_id',
        'msLevel': 'ms_level',
        'retentionTime': 'scan_time_min',
        'm/z array': 'mz',
        'intensity array': 'intensity'
    }
    cols = [
        'scan_id', 'ms_level', 'polarity', 'scan_time_min', 'mz', 'intensity'
    ]

    frames = []
    with mzxml.MzXML(fn) as ms_data:
        # Iterating the reader ends on the same StopIteration that the
        # explicit next() loop used to catch.
        for scan in ms_data:
            frame = pd.DataFrame(scan)
            # Fix byteorder issue
            frame.loc[:, :] = frame.values.byteswap().newbyteorder()
            frames.append(frame[source_cols])

    table = pd.concat(frames)
    table['retentionTime'] = table['retentionTime'].astype(np.float32)
    table['m/z array'] = table['m/z array'].astype(np.float32)
    table['intensity array'] = table['intensity array'].astype(int)
    table = table.rename(columns=new_names).reset_index(drop=True)
    return table[cols]
assert (subset_csv.split('.')[-1] == 'csv') except: print("subset csv file is not provided, will not be used") subset_csv = None guide_file_path = '/'.join(guide.split('.')[0].split('/')[:-1]) if guide_file_type == 'txt': print('accessing encycolpedia file:', guide) rt_fit_file = glob.glob(guide_file_path+'./*.rt_fit.txt')[0] print('accessing encycolpedia file rt file:', rt_fit_file) elif guide_file_type == 'xml': print('accessing pepxml file:', pepxmlfilename) print("################################################\n") print('reading mzXML file...') mzxml_it = mzxml.MzXML(mzXMLfilename) print('reading guide file...') if guide_file_type =='txt': encyc_parsed = guide_parsers.parse_encyclopedia(guide_file_path, filename=guide_file_name, q_value=q_cut, IO=PARSERIO) guide_parsed = guide_parsers.parse_rt_fit(guide_file_path, encyc_parsed, filename = rt_fit_file, IO=PARSERIO) elif guide_file_type=='xml': pepxml_it = pepxml.PepXML(pepxmlfilename) elif guide_file_type=='csv': sys.exit('csv not yet implemented') os.makedirs(os.path.dirname('misc/NPULSE.batch'), exist_ok=True) # create NPULSE file t1 = time.time() run_program(mzxml_it, pepxml_it) t2 = time.time() dt = t2 - t1
# It needs to be considered in the script # << ReAdW-based mzXML >> # An mzXML file from ReAdW has the following characteristics # 1. MS3 scan is not always MS2 + 1 scan (it depends on MS instrument, not on MSconvert) # For example, # scan#100: MS2 -> scan#101: MS3 (generally) # scan#1000: MS2, scan#1001: MS2 -> scan#1003:MS3, scan#1004: MS3 (some cases) # 2. "msLevel" of MS2 scan is set to 0 # 3. There's no tag representing precursor m/z in MS2 and MS3 # 4. Precursor m/z can be inferred from "filterLine" tag in MS2 and MS3 # 5. Precursor m/z value is identical to .raw file (no re-evaluation/re-calculation) # Inferred relationship between MS2 and MS3 using mzXML file mzxmlFile = "NCI-11plex-1-F1-f10268.mzXML" reader = mzxml.MzXML(mzxmlFile) nTotScans = len(reader) nMS1, nMS2, nMS3 = 0, 0, 0 progress = progressBar(nTotScans) f = open("MS2_MS3_mzXML.txt", "w") with reader: ms2ToMs3 = {} for spec in reader: progress.increment() if spec["msLevel"] == 1: nMS1 += 1 precMzToMs2 = { } # This dictionary is re-initiated for every MS1-scan cycle if spec["msLevel"] == 2: nMS2 += 1 precMz = spec["precursorMz"][0]["precursorMz"]
params["last_scan_extraction"] = "100000" # the last scan used for search params["isolation_window"] = "1" # isolation window size 1= +/-0.5 params["mass_correction"] = "0" # 0 = no correction, 1 = MS1-based params["signal_noise_ratio"] = "0" # fold of the minimum signal noise ratio params[ "max_percentage_RT_range"] = "100" # threshold maximum percentage of the range of retention time of a peak params["min_peak_intensity"] = "10000" # threshold of a peak intensity params["skipping_scans"] = "3" # number of skipping scans during 3D formation params[ "mass_tolerance_peak_matching"] = "3" # mass tolerance for peak matching during 3D formation features, ms1ToFeatures = detectFeatures(mzXML, params) # features = pd.read_pickle("FTLD_Batch2_F50_Features_SN0_Gap3_3ppm.pickle") # ms1ToFeatures = pd.read_pickle("FTLD_Batch2_F50_MS1_to_Features.pickle") # Read mzXML file reader = mzxml.MzXML(mzXML) ms2ToSurvey = getMs2ToSurvey(reader) mzXMLBaseName = os.path.basename(mzXML).split(".")[0] # Read ID.txt files to extract PSM information print(" Read ID.txt file and feature file") psms = pd.read_csv(idTxt, skiprows=1, sep=";") # Note that ID.txt file is delimited by semicolon psms = psms[["Peptide", "Outfile", "measuredMH", "XCorr"]] psms = psms.loc[psms["Outfile"].str.contains( mzXMLBaseName)] # Extract PSMs from FTLD_Batch2_F50.mzXML psms["precMz"] = np.nan psms["charge"] = np.nan psms["featureIndex"] = np.nan psms["category"] = "" psms = psms.drop_duplicates()
loaded_model_json = json_file.read() json_file.close() loaded_model = model_from_json(loaded_model_json) loaded_model.load_weights("SteroidXtract_model.h5") loaded_model.compile(optimizer='adam', loss='binary_crossentropy') os.chdir(input_dir) files = [f for f in os.listdir(input_dir) if f.endswith('.mzXML')] for l in range(len(files)): print('New file loaded') os.chdir(input_dir) # read mzxml file mzxml_file = files[l] print(files[l]) file = mzxml.MzXML(mzxml_file) # dict feature_df = pd.DataFrame( np.nan, index=range(len(file)), columns=['mzxml_index', 'precursor_MZ', 'rt', 'precursor_intensity']) # fill in precursorMZ and RT information h = 0 for i in range(len(file)): if (file[i]['msLevel'] != 2): continue # only MS2 recorded if (file[i]['retentionTime'] > rt_threshold): continue feature_df.iloc[h, 0] = int(file[i]['num']) feature_df.iloc[h, 1] = float(file[i]['precursorMz'][0]['precursorMz']) feature_df.iloc[h, 2] = float(file[i]['retentionTime']) feature_df.iloc[h, 3] = int( file[i]['precursorMz'][0]['precursorIntensity'])
def ms2ForFeatures(full, mzxmlFiles, paramFile):
    """Assign MS2 spectra to aligned features and write them to disk.

    For every mzXML file, the scans between the smallest "minMS1" and the
    largest "maxMS1" of the features are visited. An MS2 scan is assigned to
    the feature(s) whose MS1 range contains its survey scan, whose m/z lies
    within the isolation tolerance of the precursor m/z (taken from the
    "filterLine" tag) and which pass the PPI threshold. Spectra are then
    consolidated per feature within a run (``intraConsolidation``) and
    between runs (``interConsolidation``).

    Args:
        full: pandas.DataFrame of aligned features; the per-run columns are
            expected to be prefixed with "<mzXML base name>_".
        mzxmlFiles: list of mzXML file paths.
        paramFile: path of the parameter file read by ``utils.getParams``.

    Returns:
        Tuple ``(df, featureToScan)``: ``df`` is the feature summary from
        ``utils.summarizeFeatures`` sorted by "feature_m/z" with the added
        columns "feature_num" and "MS2"; ``featureToScan`` is an object array
        of shape (features, files) holding ";"-joined MS2 scan numbers or None.

    Side effects:
        Creates "align_<output_name>/MS2" under the current working
        directory, writes one "f<N>.MS2" file per feature that has a
        spectrum, pickles ``df`` to ".fully_aligned_feature.pickle", and,
        when params["skip_feature_detection"] == "0", moves every mzXML file
        into the directory named after its base name.
    """
    print(" Identification of MS2 spectra for the features")
    print(" ==============================================")
    logging.info(" Identification of MS2 spectra for the features")
    logging.info(" ==============================================")
    # Change pd.DataFrame to np.RecArray for internal computation (speed issue)
    full = full.to_records(index=False)

    ######################################
    # Load parameters and initialization #
    ######################################
    params = utils.getParams(paramFile)
    ppiThreshold = params["ppi_threshold_of_features"]  # "max" or a numeric value
    pctTfThreshold = float(params["max_percentage_RT_range"])
    tolIsolation = float(params["isolation_window"])
    tolPrecursor = float(params["tol_precursor"])
    tolIntraMS2Consolidation = float(params["tol_intra_ms2_consolidation"])
    tolInterMS2Consolidation = float(params["tol_inter_ms2_consolidation"])
    nFeatures = len(full)
    nFiles = len(mzxmlFiles)
    featureToScan = np.empty((nFeatures, nFiles), dtype=object)
    featureToSpec = np.empty((nFeatures, nFiles), dtype=object)
    # Precursor m/z is the number in front of "@" in the "filterLine" tag
    filterLinePattern = re.compile("([0-9.]+)\\@")

    #################################################
    # Assignment of MS2 spectra to features         #
    # Consolidation of MS2 spectra for each feature #
    #################################################
    for m, file in enumerate(mzxmlFiles):  # m = index for input files
        fileBasename, _ = os.path.splitext(os.path.basename(file))
        colNames = [
            item for item in full.dtype.names
            if item.startswith(fileBasename + "_")
        ]
        subset = full[colNames]
        # Strip the "<file>_" prefix so columns can be addressed generically
        subset.dtype.names = [s.split("_")[-1] for s in subset.dtype.names]
        ms2Dict = {}
        minScan, maxScan = int(np.nanmin(subset["minMS1"])), int(
            np.nanmax(subset["maxMS1"]))
        progress = utils.progressBar(maxScan - minScan + 1)
        print(" %s is being processed" % os.path.basename(file))
        print(" Looking for MS2 scan(s) responsible for each feature")
        logging.info(" %s is being processed" % os.path.basename(file))
        logging.info(" Looking for MS2 scan(s) responsible for each feature")

        # The reader is closed when the block ends, so the file is no longer
        # held open when it is moved at the end of this function.
        with mzxml.MzXML(file) as reader:
            # Reset per file so a survey scan of the previous file is never reused
            surveyNum = None
            for scanNum in range(minScan, maxScan + 1):
                progress.increment()
                spec = reader[str(scanNum)]
                msLevel = spec["msLevel"]
                if msLevel == 1:
                    surveyNum = scanNum
                    continue
                if msLevel != 2:
                    continue
                if surveyNum is None:
                    # MS2 scan before any MS1 scan in the range: no survey scan
                    continue

                # Find MS2 scans which satisfy the following conditions
                # From the discussion around June 2020,
                # 1. In ReAdW-derived mzXML files, precursor m/z values are in two tags: "precursorMz" and "filterLine"
                # 2. Through Haiyan's manual inspection, the real precursor m/z value is closer to one in "filterLine" tag
                # 3. So, in this script, precursor m/z of MS2 scan is obtained from "filterLine" tag
                # 4. Note that it may be specific to ReAdW-derived mzXML files since MSConvert-derived mzXML files do not have "filterLine" tag
                # 4.1. In this case, maybe the use of mzML (instead of mzXML) would be a solution (to-do later)
                p = filterLinePattern.search(spec["filterLine"])
                precMz = float(p.group(1))
                survey = reader[str(surveyNum)]
                fInd = np.where((surveyNum >= subset["minMS1"])
                                & (surveyNum <= subset["maxMS1"])
                                & (subset["mz"] >= (precMz - tolIsolation))
                                & (subset["mz"] <= (precMz + tolIsolation))
                                & (subset["PercentageTF"] <= pctTfThreshold))[0]
                if len(fInd) == 0:
                    continue

                # Strongest survey-scan peak within the ppm window of each
                # candidate feature (0 when there is no such peak)
                ppi = []
                for featureIdx in fInd:
                    mz = subset["mz"][featureIdx]
                    lL = mz - mz * tolPrecursor / 1e6
                    uL = mz + mz * tolPrecursor / 1e6
                    ind = np.where((survey["m/z array"] >= lL)
                                   & (survey["m/z array"] <= uL))[0]
                    if len(ind) > 0:
                        ppi.append(np.max(survey["intensity array"][ind]))
                    else:
                        ppi.append(0)
                if sum(ppi) == 0:
                    continue
                ppi = ppi / np.sum(ppi) * 100  # Convert intensities to percentage values
                if ppiThreshold == "max":
                    fInd = np.array([fInd[np.argmax(ppi)]])
                else:
                    # ppiThreshold should be a numeric value
                    ppiThreshold = float(ppiThreshold)
                    fInd = fInd[np.where(ppi > ppiThreshold)]
                if len(fInd) == 0:  # Last check of candidate feature indexes
                    continue

                # Add this MS2 scan information to ms2Dict
                ms2Dict[spec["num"]] = {}
                ms2Dict[spec["num"]]["mz"] = spec["m/z array"]
                ms2Dict[spec["num"]]["intensity"] = spec["intensity array"]

                # Mapping between features and MS2 scan numbers
                for featureIdx in fInd:
                    if featureToScan[featureIdx, m] is None:
                        featureToScan[featureIdx, m] = spec["num"]
                    else:
                        featureToScan[featureIdx, m] += ";" + spec["num"]

        print(
            " Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        logging.info(
            " Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            if featureToScan[i, m] is not None:
                spec = intraConsolidation(ms2Dict, featureToScan[i, m],
                                          tolIntraMS2Consolidation)
                featureToSpec[i, m] = spec

    print(
        " Merging MS2 spectra for each feature between runs when there are multiple runs"
    )
    print(
        " Simplification of MS2 spectrum for each feature by retaining the most strongest 100 peaks"
    )
    logging.info(
        " Merging MS2 spectra for each feature between runs when there are multiple runs"
    )
    logging.info(
        " Simplification of MS2 spectrum for each feature by retaining the most strongest 100 peaks"
    )
    # One slot per feature; slots without any MS2 spectrum stay None.
    # Preallocated instead of growing the array with np.append.
    specArray = np.empty(nFeatures, dtype=object)
    progress = utils.progressBar(nFeatures)
    for i in range(nFeatures):
        progress.increment()
        if all(s is None for s in featureToSpec[i, :]):
            specArray[i] = None
        else:
            specArray[i] = interConsolidation(featureToSpec[i, :],
                                              tolInterMS2Consolidation)

    ###############################
    # MS2 processing for features #
    ###############################
    # "specArray" is the list of (consolidated) MS2 spectra
    # specArray[i] is the MS2 spectrum corresponding to the i-th feature
    # If there's no MS2 spectrum, then specArray[i] is None
    df = utils.summarizeFeatures(full, params)
    # Add the mean m/z of feature and its charge state to the beginning of MS2 spectrum (similar to .dta file)
    for i in range(nFeatures):
        if specArray[i] is not None:
            specArray[i]["mz"] = np.insert(specArray[i]["mz"], 0,
                                           df["feature_m/z"].iloc[i])
            specArray[i]["intensity"] = np.insert(specArray[i]["intensity"], 0,
                                                  df["feature_z"].iloc[i])
    df["MS2"] = specArray
    df = df.sort_values(
        by="feature_m/z",
        ignore_index=True)  # Features are sorted by "feature_m/z"
    # "feature_num" follows the ascending order of "feature_m/z" (as sorted)
    df.insert(loc=0, column="feature_num", value=df.index + 1)

    # Write MS2 spectra to files
    filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
    ms2Path = os.path.join(filePath, "MS2")
    # makedirs also creates the "align_..." parent when it does not exist yet
    os.makedirs(ms2Path, exist_ok=True)
    for i in range(df.shape[0]):
        if df["MS2"].iloc[i] is not None:
            fileName = os.path.join(ms2Path, "f" + str(i + 1) + ".MS2")
            dfMS2 = pd.DataFrame.from_dict(df["MS2"].iloc[i])
            dfMS2.to_csv(fileName, index=False, header=False, sep="\t")

    # Save fully-aligned features with their MS2 spectra (i.e. res) for debugging purpose
    # When the pipeline gets mature, this part needs to be removed
    # The leading "." makes the file hidden
    picklePath = os.path.join(filePath, ".fully_aligned_feature.pickle")
    with open(picklePath, "wb") as pickleFile:
        pickle.dump(df, pickleFile)

    ##########################
    # Handling mzXML file(s) #
    ##########################
    # Move mzXML files to the directory(ies) where individual .feature files are located
    if params["skip_feature_detection"] == "0":
        for file in mzxmlFiles:
            baseFilename = os.path.basename(file)
            featureDirectory = os.path.join(os.getcwd(),
                                            os.path.splitext(baseFilename)[0])
            os.rename(file, os.path.join(featureDirectory, baseFilename))

    return df, featureToScan
# mzxmlFile = "NCI-11plex-1-F1-f10268.mzXML" paramFile = sys.argv[1] params = getParams(paramFile) idTxt = params["idtxt"] print(" Loading ID.txt file") psms, pep2psm, prot2psm, jumpfPath = parseIdtxt(idTxt, params) #################################################### # Extract TMT reporter ion intensities - 1st round # #################################################### print(" Extraction of TMT reporter ion intensities") ms2ToMs3 = {} qdict = { } # Dictionary; key = MS2 scan number, value = TMT reporter intensities (array) for frac in sorted(psms.keys()): reader = mzxml.MzXML(frac) print(" Processing %s" % os.path.basename(frac)) print(" Looking for MS2 precursor scans of MS3 scans") ms2ToMs3[frac] = matchMs2ToMs3(psms[frac], reader) print(" Reporter intensities are being extracted from MS3 scans") progress = progressBar(len(ms2ToMs3[frac])) for ms2, ms3 in ms2ToMs3[frac].items(): progress.increment() reporterIntensity = getReporterIntensity(ms3, reader, params) key = os.path.basename(frac) + "_" + ms2 qdict[key] = reporterIntensity print() # Create a dataFrame after the first extraction of reporter m/z and intensity values reporters = params["tmt_reporters_used"].split(";") columnNames = [re.sub("sig", "mz", i) for i in reporters] + reporters