def load():
    p = utils.getParams()
    p['inputDir'] = 'input/dream4'
    p['metaDataFile'] = 'meta_data.tsv'
    p['priorsFile'] = 'gold_standard.tsv'
    p['goldStandardFile'] = 'gold_standard.tsv'
    # NOTE: Currently does not work using 1 bootstrap. Please use 2 or more
    p['numBoots'] = 2
    p['cores'] = 1
    p['delTMax'] = 110
    p['delTMin'] = 0
    p['tau'] = 45
    p['percTp'] = [50] * 4
    p['permTp'] = [1] * 4
    p['percFp'] = [0, 100, 250, 500]
    p['permFp'] = [1, 5, 5, 5]
    p['evalOnSubset'] = False
    p['method'] = 'BBSR'
    p['priorWeight'] = 1.26
    p['saveToDir'] = 'output/dream4_BBSR_1'
    p['verbose'] = True
    p['demo'] = True
    p['exportCLRMatrix'] = True
    p['exportBSDR'] = True
    utils.setParams(p)
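# A minimal usage sketch (assumption, pieced together from the driver snippet later in this
# listing: job files like the one above are loaded by name via utils.loadJob, which runs
# their load() to populate the shared parameter dictionary; the job name 'dream4_BBSR_1'
# below is hypothetical):
import utils

utils.loadJob('default')
utils.loadJob('dream4_BBSR_1')  # hypothetical job whose load() is shown above
pars = utils.getParams()
print(pars['method'], pars['numBoots'])  # -> BBSR 2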
def paramsFlopsCounter(models, num_classes=10, input_shape=(3, 32, 32)):
    logger = get_logger("./")
    for modelname in models:
        model = get_models(modelname, num_classes=num_classes)  # was hard-coded to 10; use the argument
        model = model.eval()
        pa1 = getParams(model)
        fl1 = getFlops(model, input_shape)
        fl2, pa2 = get_model_complexity_info(model, input_shape, True)
        # logger.info("{} v1: {}--{}".format(model, pa1, fl1))
        logger.info("{} v1: {}--{} v2: {}--{}".format(modelname, pa1, fl1, pa2, fl2))
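# Hedged example call (assumptions: get_logger/get_models/getParams/getFlops are this
# repository's own helpers, and get_model_complexity_info comes from the ptflops package;
# the model names below are placeholders):
if __name__ == "__main__":
    paramsFlopsCounter(["resnet20", "vgg16"], num_classes=10, input_shape=(3, 32, 32))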
def getSData(self, index):
    # Data handling inside row
    # If clicked: copy CP
    # If Ctrl + Click: copy Params
    data = self.sModel.data(self.sModel.index(index.row(), 2), role=self.DR).strip()
    _modifiers = QApplication.keyboardModifiers()
    if data.strip():
        if _modifiers == QtCore.Qt.ControlModifier:
            params = utils.getParams(data)
            if params:
                logging.info(' Copied Params: ' + params)
                # self.cmdLabel.setText('Copied Params: ' + params)
                self.cb.setText(params, mode=self.cb.Clipboard)
        else:
            cb_cp = utils.getCP(data)
            if cb_cp:
                logging.info(' Copied CP: ' + cb_cp)
                # self.cmdLabel.setText('Copied CP: ' + cb_cp)
                self.cb.setText(cb_cp, mode=self.cb.Clipboard)
def rec(target):
    processed.add(target)
    url = getUrl(target, True)
    params = getParams(target, '', True)  # Extract the GET parameters
    if '=' in target:  # if there's a = in the url, there should be GET parameters
        inps = []
        for name, value in params.items():
            inps.append({'name': name, 'value': value})
        forms.append({0: {'action': url, 'method': 'get', 'inputs': inps}})
    response = requester(url, params, headers, True, delay, timeout).text
    # retireJs(url, response)  # Check the <script> tags for known vulnerabilities
    # if not skipDOM:
    #     highlighted = dom(response)
    #     clean_highlighted = ''.join([re.sub(r'^\d+\s+', '', line) for line in highlighted])
    #     if highlighted and clean_highlighted not in checkedDOMs:
    #         checkedDOMs.append(clean_highlighted)
    #         logger.good('Potentially vulnerable objects found at %s' % url)
    #         logger.red_line(level='good')
    #         for line in highlighted:
    #             logger.no_format(line, level='good')
    #         logger.red_line(level='good')
    forms.append(get_form(response))  # Extract all form elements from the response
    matches = re.findall(r'<[aA].*href=["\']{0,1}(.*?)["\']', response)
    for link in matches:  # iterate over the matches
        # remove everything after a "#" to deal with in-page anchors
        link = link.split('#')[0]
        if link.endswith(('.pdf', '.png', '.jpg', '.jpeg', '.xls', '.xml', '.docx', '.doc')):
            pass
        else:
            if link[:4] == 'http':
                if link.startswith(main_url):
                    storage.add(link)
            elif link[:2] == '//':
                if link.split('/')[2].startswith(host):
                    storage.add(schema + link)
            elif link[:1] == '/':
                storage.add(main_url + link)
            else:
                storage.add(main_url + '/' + link)
def tssig(self):
    """ Get the type info """
    if self._client.server_handle is not None:
        self.reload()
        file = self.vim.current.buffer.name
        line = self.vim.current.window.cursor[0]
        offset = self.vim.current.window.cursor[1] + 1
        info = self._client.getSignature(file, line, offset)
        if info:
            signatureHelpItems = list(map(lambda item: {
                'variableArguments': item['isVariadic'],
                'prefix': utils.convertToDisplayString(item['prefixDisplayParts']),
                'suffix': utils.convertToDisplayString(item['suffixDisplayParts']),
                'separator': utils.convertToDisplayString(item['separatorDisplayParts']),
                'parameters': list(map(lambda p: {
                    'text': utils.convertToDisplayString(p['displayParts']),
                    'documentation': utils.convertToDisplayString(p['documentation']),
                }, item['parameters']))
            }, info['items']))
            params = utils.getParams(signatureHelpItems[0]['parameters'],
                                     signatureHelpItems[0]['separator'])
            self.printHighlight(params)
    else:
        self.printError('Server is not running')
def btnGetData(self):
    # type = self.getRadioCodeType()
    freq = self.getRadioFreqType()
    year = self.getYearMonth()
    index_hs = self.getIndexHS()
    reporter = self.tab02_select_reporter.currentText()
    partner = self.tab03_select_partner.currentText()
    trade_flow = self.tab04_select_trade_flow.currentText()
    start_index = self.tab09_input_start_index.text()
    end_index = self.tab10_input_end_index.text()
    start_hs = self.tab11_input_start_hs.text()
    end_hs = self.tab12_input_end_hs.text()
    # token = self.tab01_input_token.text()
    check_input, message_input = checkInput(PeriodYear=year,
                                            StartIndex=start_index,
                                            EndIndex=end_index,
                                            StartHS=start_hs,
                                            EndHS=end_hs)
    check_select, message_select = checkSelect(Reporter=reporter,
                                               Partner=partner,
                                               TradeFlow=trade_flow)
    check_index, message_index = checkIndex(start_index, end_index)
    if self.tab06_radio_freq_month.isChecked() and not self.check_all.isChecked():
        check_month, message_month = self.checkMonth()
    else:
        check_month = True
        message_month = ''
    if check_input and check_select and check_index and check_month:
        repaintText(self.text_message, 'Start get data ...')
        params = getParams(year, freq, reporter, partner, trade_flow)
        data = getData(params, start_index, end_index, start_hs, index_hs, self.text_message)
        message = dataToExcel(data)
        repaintText(self.text_message, message)
    else:
        repaintText(self.text_message,
                    '{}{}{}{}'.format(message_input, message_select,
                                      message_index, message_month))
def tBoxChanged(self):
    # Handles the PACR type combo box signal change
    # If type is Remove, disallow any CP steps to be added or deleted
    if self.typeBox.currentText() == 'Remove':
        self.addStep.setEnabled(False)
        self.delStep.setEnabled(False)
    else:
        self.addStep.setEnabled(True)
        self.delStep.setEnabled(True)
    # Fills data if changing a step
    if self.typeBox.currentText() == 'Change':
        _row = int(self.stepEdit.text()) - 1
        if _row < self.sModel.rowCount() + 1:
            record = self.sModel.record(_row)
            self.rationale.setPlainText(record.field(3).value())
            step = record.field(2).value()
            cp = utils.getCP(step)
            if cp:
                cp = cp.replace('.prc', '') + '(' + utils.getParams(step) + ')'
                self.pSteps.populate(cp)
def tssig(self):
    """ Get type signature for symbol at cursor """
    self.reload()
    file = self.vim.current.buffer.name
    line = self.vim.current.window.cursor[0]
    offset = self.vim.current.window.cursor[1] + 1
    info = self._client.getSignature(file, line, offset)
    if info:
        signatureHelpItems = list(map(lambda item: {
            'variableArguments': item['isVariadic'],
            'prefix': utils.convertToDisplayString(item['prefixDisplayParts']),
            'suffix': utils.convertToDisplayString(item['suffixDisplayParts']),
            'separator': utils.convertToDisplayString(item['separatorDisplayParts']),
            'parameters': list(map(lambda p: {
                'text': utils.convertToDisplayString(p['displayParts']),
                'documentation': utils.convertToDisplayString(p['documentation']),
            }, item['parameters']))
        }, info['items']))
        params = utils.getParams(signatureHelpItems[0]['parameters'],
                                 signatureHelpItems[0]['separator'])
        self.printHighlight(params)
import math, os, re, sys, pickle, logging
import pandas as pd
import utils

def searchDatabase(features, paramFile, queue="standard"):
    # features: fully-aligned features (pandas DataFrame)
    # paramFile: parameter file

    #################################
    # Preparation of job submission #
    #################################
    m = features.shape[0]
    n = 10  # Default number of entries in each job
    if int(m / n) > 200:  # When there are too many features, limit the number of jobs to 200
        n = int(m / 200) + 1
    nJobs = math.ceil(m / n)

    # Create a temporary directory for jobs (to be removed later) and change the working directory for jobs
    cwd = os.getcwd()
    tmpDir = os.path.join(cwd, ".tmp")
    if os.path.exists(tmpDir):
        os.system("rm -rf " + tmpDir)
    os.mkdir(tmpDir)
    os.system("cp " + paramFile + " " + tmpDir)  # Copy the parameter file to "tmpDir"
    os.chdir(tmpDir)  # Change the working directory to the temporary one

    ##################
    # Job submission #
    ##################
    jobNumbers = []
    mem = 1000  # Default memory reserved = 1000MB
    for i in range(nJobs):
        # Split features into "nJobs" chunks and use each chunk in each job
        start = n * i
        end = min(m, n * (i + 1))
        featureFile = "features_" + str(i) + ".pickle"
        pickle.dump(features.iloc[start:end], open(featureFile, "wb"))
        # Submission of jobs to LSF
        jobNumber = submitJobs(i, featureFile, paramFile, mem, queue)
        jobNumbers.append(jobNumber)
        text = "\r {} job(s) is/are submitted".format(i + 1)
        sys.stdout.write(text)
        sys.stdout.flush()

    # Check the status of submitted jobs
    print()
    logging.info(" {} job(s) is/are submitted".format(nJobs))
    checkJobStatus(jobNumbers)

    ########################################################
    # Check unfinished jobs (due to not enough memory) and #
    # re-submit them with increased memory                 #
    ########################################################
    print()
    print(" Checking unfinished jobs")
    logging.info("")
    logging.info(" Checking unfinished jobs")
    isFinished = False
    while not isFinished:
        jobNumbers = []
        ii = 0
        for i in range(nJobs):
            csvFile = "features_" + str(i) + ".csv"
            if not os.path.exists(csvFile):
                # When a job is not finished properly, there's no corresponding .csv file
                # Extraction of the required memory by parsing the .o file
                f = open("job_" + str(i) + ".o")
                lines = f.read()
                mem = int(re.search(r"(?<=Max Memory :)\s+(\d+)", lines).group(1)) * 2  # Times 2 for safety
                f.close()
                # Re-submission of jobs
                featureFile = "features_" + str(i) + ".pickle"
                jobNumber = submitJobs(i, featureFile, paramFile, mem, queue)
                jobNumbers.append(jobNumber)
                ii += 1
                text = "\r {} job(s) is/are submitted".format(ii)  # ii already counts this submission
                sys.stdout.write(text)
                sys.stdout.flush()
                logging.info(" {} job(s) is/are submitted".format(ii))
        # Check the status of submitted jobs
        if len(jobNumbers) > 0:
            print()
            checkJobStatus(jobNumbers)
        else:
            isFinished = True
    print()
    print(" All job(s) is/are finished")
    logging.info("")
    logging.info(" All job(s) is/are finished")

    ##########################
    # Postprocessing of jobs #
    ##########################
    res = pd.DataFrame()
    for i in range(nJobs):
        eachOutput = "features_" + str(i) + ".csv"
        try:
            df = pd.read_csv(eachOutput, sep="\t")
        except pd.errors.EmptyDataError:
            continue
        res = res.append(df, ignore_index=True)

    ################################
    # Generation of an output file #
    ################################
    os.chdir(cwd)  # Move back to the "current working directory"
    params = utils.getParams(paramFile)
    filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
    if not os.path.exists(filePath):
        os.mkdir(filePath)
    outputFile = os.path.join(filePath, "align_" + params["output_name"] + ".database_matches")
    res.to_csv(outputFile, sep="\t", index=False, na_rep="NA")
    # os.system("rm " + os.path.join(tmpDir, "features_*"))
    # os.system("rm " + os.path.join(tmpDir, "job_*"))
    return res
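# A hypothetical driver for searchDatabase (the pickle path is a placeholder; submitJobs
# and checkJobStatus are assumed to wrap LSF's bsub/bjobs, as the function's comments imply):
import pandas as pd

features = pd.read_pickle("fully_aligned_features.pickle")  # placeholder input
matches = searchDatabase(features, "jumpm.params", queue="standard")
print("{} database match(es)".format(matches.shape[0]))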
import os, sys, logging
import numpy as np
import utils

def alignFeatures(fArray, xmlFiles, paramFile):
    nFiles = len(xmlFiles)

    # Pandas dataframe to numpy structured array for internal computation
    for i in range(nFiles):
        fArray[i] = fArray[i].to_records(index=False)

    ###################
    # Load parameters #
    ###################
    params = utils.getParams(paramFile)

    # Features derived from feature files are stored in fArray. For example,
    #   xmlFiles = [file1, file2, file3]
    #   fArray[0] = features from file1 (which has column names like 'index', 'mz', etc.)
    #   fArray[1] = features from file2
    #   ...
    # The array of m/z values from the first feature file can be accessed by fArray[0]['mz']
    if nFiles > 1:  # Multiple feature files -> alignment is required
        print(" Feature calibration")
        print(" ===================")
        logging.info(" Feature calibration")
        logging.info(" ===================")

        ###################################
        # Selection of a reference sample #
        ###################################
        if params["reference_feature"] == "0":
            # The run with the largest median of its top 100 intensities is set as the reference run
            refNo = 0
            refIntensity = 0
            for i in range(nFiles):
                tmpIntensity = np.median(sorted(fArray[i]["intensity"], reverse=True)[0:100])
                if tmpIntensity >= refIntensity:
                    refNo = i
                    refIntensity = tmpIntensity
        elif params["reference_feature"] == "1":
            # The run with the largest number of features is set as the reference run
            refNo = 0
            refN = 0
            for i in range(nFiles):
                tmpN = len(fArray[i])
                if tmpN >= refN:
                    refNo = i
                    refN = tmpN
        else:
            try:
                refNo = xmlFiles.index(params["reference_feature"])
            except ValueError:
                sys.exit(" 'reference_feature' parameter should be correctly specified")
        print(" %s is chosen as the reference run" % os.path.basename(xmlFiles[refNo]))
        logging.info(" %s is chosen as the reference run" % os.path.basename(xmlFiles[refNo]))

        ############################################################
        # Calibration of features against those in a reference run #
        ############################################################
        rtSdArray, mzSdArray = [], []
        featureNames = []
        for i in range(nFiles):
            featureName = os.path.basename(xmlFiles[i])
            featureNames.append(featureName)
            if i != refNo:
                print(" " + featureName + " is being aligned against the reference run (it may take a while)")
                logging.info(" " + featureName + " is being aligned against the reference run (it may take a while)")
                fArray[i], rtSd, mzSd = calibrateFeatures(fArray[refNo], fArray[i], params)
                rtSdArray.append(rtSd)
                mzSdArray.append(mzSd)
            else:
                rtSdArray.append("NA")
                mzSdArray.append("NA")

        print(" Calibration summary")
        print(" ===================")
        print(" After calibration, RT- and m/z-shifts of each run (against the reference run) are centered to zero")
        print(" Variations (i.e. standard deviations) of RT- and m/z-shifts are as follows,")
        print(" Filename\t\t\t#features\tSD of RT-shifts [second]\tSD of m/z-shifts [ppm]")
        logging.info(" Calibration summary")
        logging.info(" ===================")
        logging.info(" After calibration, RT- and m/z-shifts of each run (against the reference run) are centered to zero")
        logging.info(" Variations (i.e. standard deviations) of RT- and m/z-shifts are as follows,")
        logging.info(" Filename\t\t\t#features\tSD of RT-shifts [second]\tSD of m/z-shifts [ppm]")
        for i in range(nFiles):
            nFeatures = str(fArray[i].shape[0])
            if i != refNo:
                meanRtSd = "%.6f" % np.mean(rtSdArray[i])
                meanMzSd = "%.6f" % np.mean(mzSdArray[i])
            else:
                meanRtSd = "NA"
                meanMzSd = "NA"
            print(" " + featureNames[i] + "\t\t\t" + nFeatures + "\t" + meanRtSd + "\t" + meanMzSd)
            logging.info(" " + featureNames[i] + "\t\t\t" + nFeatures + "\t" + meanRtSd + "\t" + meanMzSd)
        print()
        logging.info("")

        #################################################################
        # Identification of fully-aligned features for further analysis #
        #################################################################
        print(" Feature alignment")
        print(" =================")
        logging.info(" Feature alignment")
        logging.info(" =================")
        fullFeatures, partialFeatures, unalignedFeatures = findMatchedFeatures(refNo, fArray, rtSdArray,
                                                                               mzSdArray, featureNames, params)
    else:
        print(" Since a single feature file is used, the feature alignment is skipped")
        logging.info(" Since a single feature file is used, the feature alignment is skipped")
        fullFeatures = np.copy(fArray[0])  # Masked array to 2D numpy array
        colNames = list(fullFeatures.dtype.names)
        featureName = os.path.splitext(os.path.basename(xmlFiles[0]))[0]
        fullFeatures.dtype.names = [featureName + "_" + c for c in colNames]
        partialFeatures, unalignedFeatures = None, None

    ################################################################
    # Write fully-, partially- and/or un-aligned features to files #
    ################################################################
    # At this step, fully-, partially- and unaligned features are written to files and saved
    # Also, those features are converted to pandas DataFrame format and returned
    dfFull, dfPartial, dfArrayUnaligned = utils.generateFeatureFile(fullFeatures, partialFeatures,
                                                                    unalignedFeatures, params)
    return dfFull, dfPartial, dfArrayUnaligned
import sys, os, re, logging
import pandas as pd
import utils
from featureDetection import detectFeatures
from datetime import datetime

##################
# Initialization #
##################
# For desktop debugging,
paramFile = r"jumpm.params"
inputFiles = [r"/home/jcho/dev/spectralLibrary/FTLD_Batch2_F50.mzXML",
              r"/home/jcho/dev/spectralLibrary/FTLD_Batch2_F51.mzXML",
              r"/home/jcho/dev/spectralLibrary/FTLD_Batch2_F52.mzXML"]
params = utils.getParams(paramFile)
skipScans = [1, 3, 5, 7, 10]
for skipScan in skipScans:
    params["skipping_scans"] = skipScan
    logFile = "jump_m.log"
    if os.path.exists(logFile):
        os.system("rm " + logFile)
    logging.basicConfig(format='%(message)s', filename=logFile, level=logging.INFO)
    print()
    print(" Jump -m started")
    logging.info(" Jump -m started")
    now = datetime.now()
import os, logging
import numpy as np
import pandas as pd
from pyteomics import mzxml
import utils

def detectFeatures(inputFile, paramFile):
    ##############
    # Parameters #
    ##############
    params = utils.getParams(paramFile)
    firstScan = int(params["first_scan_extraction"])
    lastScan = int(params["last_scan_extraction"])
    gap = int(params["skipping_scans"])
    scanWindow = gap + 1
    matchPpm = float(params["mass_tolerance_peak_matching"])

    ##################
    # Initialization #
    ##################
    reader = mzxml.read(inputFile)
    f = []  # Feature array
    nFeatures = -1
    cache = []
    noise = {}  # Empty dictionary for noise level information
    oldMinInd = -1
    oldMaxInd = -1

    ############################
    # Get MS1 scan information #
    ############################
    ms = []
    with reader:
        msCount = 0
        # filename = os.path.basename(inputFile)
        # print(" Extraction of MS1 spectra from %s" % filename)
        for spec in reader:
            msLevel = int(spec["msLevel"])
            scanNum = int(spec["num"])
            if msLevel == 1 and firstScan <= scanNum <= lastScan:
                ms.append(spec)
                msCount += 1
            elif scanNum > lastScan:
                break
        # print(" Done")

    ################################
    # Feature (3D-peak) generation #
    ################################
    filename = os.path.basename(inputFile)
    print(" Feature detection from %s" % filename)
    logging.info(" Feature detection from " + filename)
    progress = utils.progressBar(msCount)
    for i in range(msCount):
        progress.increment()
        minInd = max(0, i - gap - 1)
        maxInd = min(msCount - 1, i + gap + 1)
        if i == 0:
            for j in range(maxInd + 1):
                spec = detectPeaks(ms[j], params)
                spec["index"] = j
                cache.append(spec)
        else:
            for j in range(oldMinInd, minInd):
                cache.pop(0)  # Remove the first element in cache
            for j in range(oldMaxInd + 1, maxInd + 1):
                spec = detectPeaks(ms[j], params)
                spec["index"] = j
                cache.append(spec)

        ##################
        # Reduction step #
        ##################
        p = cache[i - minInd]
        pCount = len(p["m/z array"])
        valids = np.array([])
        count = 0
        for j in range(pCount):
            cm = p["m/z array"][j]
            match = 0
            nTry = 0
            # Backward search
            for k in range(i - 1, minInd - 1, -1):
                q = cache[k - minInd]
                if q["m/z array"].size == 0:
                    continue
                else:
                    match, ind = getClosest(q, cm, matchPpm)
                if match == 1:
                    break
                nTry += 1
                if nTry > scanWindow:
                    break
            if match == 0:
                # Forward search
                nTry = 0
                for k in range(i + 1, maxInd + 1):
                    q = cache[k - minInd]
                    if q["m/z array"].size == 0:
                        continue
                    else:
                        match, ind = getClosest(q, cm, matchPpm)
                    if match == 1:
                        break
                    nTry += 1
                    if nTry > scanWindow:
                        break
            if match == 1:
                valids = np.append(valids, j)

        # Peak reduction and noise-level estimation
        p, noise = reduceMS1(p, noise, valids)

        #####################
        # Peak merging step #
        #####################
        cache[i - minInd] = p
        pCount = len(p["m/z array"])
        for j in range(pCount):
            cm = p["m/z array"][j]
            match = 0
            nTry = 0
            matchedPeakInd = []
            # Backward search
            for k in range(i - 1, minInd - 1, -1):
                q = cache[k - minInd]
                if q["m/z array"].size == 0:
                    continue
                else:
                    matchIndicator, ind = getClosest(q, cm, matchPpm)
                    # matchIndicator = 1 means that the j-th (reduced) peak in the i-th scan
                    # can form a 3D-peak with the ind-th (reduced) peak in the previous scan (q)
                    if matchIndicator == 1:
                        matchedPeakInd.append(q["featureIndex"][ind])
                        match = 1
            if match == 1:
                matchedPeakInd = list(set(matchedPeakInd))  # Make the list unique
                fInd = None
                if len(matchedPeakInd) > 1:
                    # There are multiple matches to the peaks in previous scans
                    fInd = min(matchedPeakInd)
                    for m in matchedPeakInd:
                        # Merge to the lowest indexed feature and remove the "merged" features
                        if m != fInd:
                            f[fInd]["mz"].extend(f[m]["mz"])
                            f[fInd]["intensity"].extend(f[m]["intensity"])
                            f[fInd]["num"].extend(f[m]["num"])
                            f[fInd]["rt"].extend(f[m]["rt"])
                            f[fInd]["index"].extend(f[m]["index"])
                            # Revise cache array
                            for s in f[m]["index"]:
                                for t in range(len(cache)):
                                    if cache[t]["index"] == s:
                                        for u in range(len(cache[t]["featureIndex"])):
                                            if cache[t]["featureIndex"][u] == m:
                                                cache[t]["featureIndex"][u] = fInd
                            f[m] = None  # Keep the size of feature array
                else:
                    fInd = matchedPeakInd[0]
                if "featureIndex" in cache[i - minInd]:
                    cache[i - minInd]["featureIndex"].append(fInd)
                else:
                    cache[i - minInd]["featureIndex"] = [fInd]
                f[fInd]["mz"].append(p["m/z array"][j])
                f[fInd]["intensity"].append(p["intensity array"][j])
                f[fInd]["num"].append(p["num"])
                f[fInd]["rt"].append(p["retentionTime"])
                f[fInd]["index"].append(p["index"])
            if match != 1:
                if i < msCount:
                    nFeatures += 1
                    if "featureIndex" in cache[i - minInd]:
                        cache[i - minInd]["featureIndex"].append(nFeatures)
                    else:
                        cache[i - minInd]["featureIndex"] = [nFeatures]
                    f.append({"mz": [p["m/z array"][j]],
                              "intensity": [p["intensity array"][j]],
                              "num": [p["num"]],
                              "rt": [p["retentionTime"]],
                              "index": [i]})
        oldMinInd = minInd
        oldMaxInd = maxInd

    # Remove empty features
    f = [i for i in f if i is not None]

    #################################
    # Filtering features (3D-peaks) #
    #################################
    # A feature may contain multiple peaks from one scan
    # In this case, the one with the largest intensity is chosen
    gMinRt, gMaxRt = 0, 0  # Global minimum and maximum RT over all features
    for i in range(len(f)):
        if len(f[i]["num"]) != len(list(set(f[i]["num"]))):
            temp = {}
            for j in range(len(f[i]["num"])):
                if f[i]["num"][j] in temp:
                    currIntensity = f[i]["intensity"][j]
                    if currIntensity > temp[f[i]["num"][j]]["intensity"]:
                        temp[f[i]["num"][j]]["intensity"] = currIntensity
                        temp[f[i]["num"][j]]["index"] = j
                else:
                    temp[f[i]["num"][j]] = {}
                    temp[f[i]["num"][j]]["intensity"] = f[i]["intensity"][j]
                    temp[f[i]["num"][j]]["index"] = j
            uInd = []
            for key in sorted(temp.keys()):
                uInd.append(temp[key]["index"])
            f[i]["mz"] = [f[i]["mz"][u] for u in uInd]
            f[i]["intensity"] = [f[i]["intensity"][u] for u in uInd]
            f[i]["num"] = [f[i]["num"][u] for u in uInd]
            f[i]["rt"] = [f[i]["rt"][u] for u in uInd]
            f[i]["index"] = [f[i]["index"][u] for u in uInd]
        if i == 0:
            gMinRt = min(f[i]["rt"])
            gMaxRt = max(f[i]["rt"])
        else:
            if min(f[i]["rt"]) < gMinRt:
                gMinRt = min(f[i]["rt"])
            if max(f[i]["rt"]) > gMaxRt:
                gMaxRt = max(f[i]["rt"])
    if gMaxRt.unit_info == "minute":
        gMaxRt = gMaxRt * 60
        gMinRt = gMinRt * 60

    ###################################
    # Organization of output features #
    ###################################
    n = 0
    ms1ToFeatures = {}
    for i in range(len(f)):
        # 1. mz: mean m/z of a feature = weighted average of m/z and intensity
        mz = np.sum(np.multiply(f[i]["mz"], f[i]["intensity"])) / np.sum(f[i]["intensity"])
        # 2. intensity: intensity of a feature (maximum intensity among the peaks composing the feature)
        intensity = max(f[i]["intensity"])
        # 3. z: charge of the feature, set to 1 now, but modified later
        z = 1
        isotope = 0  # Will be used later
        # 4. RT: RT of the representative peak (i.e. strongest peak) of a feature
        ind = np.argmax(f[i]["intensity"])
        rt = f[i]["rt"][ind]
        # 5. minRT and maxRT
        minRt = min(f[i]["rt"])
        maxRt = max(f[i]["rt"])
        # Conversion of RT to the unit of second
        if rt.unit_info == "minute":
            rt = rt * 60  # Convert to the unit of second
            minRt = minRt * 60
            maxRt = maxRt * 60
        # 6. MS1 scan number of the representative peak of a feature
        ms1 = f[i]["num"][ind]
        # 7. minMS1 and maxMS1
        minMs1 = min(list(map(int, f[i]["num"])))
        maxMs1 = max(list(map(int, f[i]["num"])))
        # 8. SNratio (signal-to-noise ratio of the feature)
        if ms1 in noise:
            noiseLevel = noise[ms1]
        else:
            noiseLevel = 500
        snRatio = intensity / noiseLevel
        featureIntensityThreshold = noiseLevel * float(params["signal_noise_ratio"])
        if intensity >= featureIntensityThreshold:
            # 9. Percentage of true feature
            pctTF = (maxRt - minRt) / (gMaxRt - gMinRt) * 100
            # Organize features in a structured numpy array form
            if n == 0:
                features = np.array([(mz, intensity, z, rt, minRt, maxRt, ms1, minMs1, maxMs1,
                                      snRatio, pctTF, isotope)],
                                    dtype="f8, f8, f8, f8, f8, f8, f8, f8, f8, f8, f8, f8")
                n += 1
            else:
                features = np.append(features,
                                     np.array([(mz, intensity, z, rt, minRt, maxRt, ms1, minMs1,
                                                maxMs1, snRatio, pctTF, isotope)],
                                              dtype=features.dtype))
            for j in range(len(f[i]["num"])):
                num = f[i]["num"][j]
                if num not in ms1ToFeatures:
                    ms1ToFeatures[num] = {"mz": [f[i]["mz"][j]],
                                          "intensity": [f[i]["intensity"][j]]}
                else:
                    ms1ToFeatures[num]["mz"].append(f[i]["mz"][j])
                    ms1ToFeatures[num]["intensity"].append(f[i]["intensity"][j])
        else:
            continue
    features.dtype.names = ("mz", "intensity", "z", "RT", "minRT", "maxRT", "MS1", "minMS1",
                            "maxMS1", "SNratio", "PercentageTF", "isotope")

    ##########################
    # Decharging of features #
    ##########################
    features = dechargeFeatures(features)
    # print()

    ############################################
    # Convert the features to pandas dataframe #
    # Write features to a file                 #
    ############################################
    df = pd.DataFrame(features)
    df = df.drop(columns=["isotope"])  # "isotope" column was internally used and does not need to be transferred

    # Create a subdirectory and save features to a file
    baseFilename = os.path.splitext(os.path.basename(filename))[0]  # i.e. filename without extension
    featureDirectory = os.path.join(os.getcwd(), baseFilename)
    if not os.path.exists(featureDirectory):
        os.mkdir(featureDirectory)
    # # Increment the number of a feature file
    # if len(glob.glob(os.path.join(featureDirectory, baseFilename + ".*.feature"))) == 0:
    #     featureFilename = os.path.splitext(os.path.basename(filename))[0] + ".1.feature"
    # else:
    #     oldNo = 0
    #     for f in glob.glob(os.path.join(featureDirectory, baseFilename + ".*.feature")):
    #         oldNo = max(oldNo, int(os.path.basename(f).split(".")[-2]))
    #     featureFilename = baseFilename + "." + str(int(oldNo) + 1) + ".feature"
    # featureFilename = os.path.join(featureDirectory, featureFilename)

    # Simply overwrite any existing feature file
    # The individual feature file still needs to be located in an input file-specific location
    # since the feature file can be directly used later
    featureFilename = baseFilename + ".feature"
    featureFilename = os.path.join(featureDirectory, featureFilename)
    df.to_csv(featureFilename, index=False, sep="\t")
    return df  # Pandas DataFrame
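# A minimal sketch of invoking detectFeatures on one run (the mzXML path is a placeholder;
# the parameter file must define first_scan_extraction, last_scan_extraction, skipping_scans,
# mass_tolerance_peak_matching and signal_noise_ratio, as read at the top of the function):
from featureDetection import detectFeatures

df = detectFeatures("FTLD_Batch2_F50.mzXML", "jumpm.params")
print(df[["mz", "intensity", "RT", "SNratio"]].head())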
import sys
sys.path.append('scripts/')
sys.path.append('jobs/')
from imports import *
import utils
from desAndRes import designAndResponse
from priors import getPriors
from groupPredictors import groupPredictors
from miAndClr import mi, mixedCLR
from bayesianRegression import BBSR

utils.loadJob('default')
utils.loadJob(sys.argv[1])
pars = utils.getParams()
random.seed(pars['jobSeed'])

# Read in data
data = utils.readInput(pars['inputDir'], pars['expMatFile'], pars['tfNamesFile'],
                       pars['metaDataFile'], pars['priorsFile'], pars['goldStandardFile'])

# Generate design and response matrices
desResp = designAndResponse(data['metaData'], data['expMat'],
                            pars['delTMin'], pars['delTMax'], pars['tau'])

# Generate priors
priors = getPriors(data['expMat'], data['tfNames'],
import os, re, pickle, logging
import numpy as np
import pandas as pd
from pyteomics import mzxml
import utils

def ms2ForFeatures(full, mzxmlFiles, paramFile):
    print(" Identification of MS2 spectra for the features")
    print(" ==============================================")
    logging.info(" Identification of MS2 spectra for the features")
    logging.info(" ==============================================")
    # Change pd.DataFrame to np.RecArray for internal computation (speed issue)
    full = full.to_records(index=False)

    ######################################
    # Load parameters and initialization #
    ######################################
    params = utils.getParams(paramFile)
    # ppiThreshold = "max"  # Hard-coded
    ppiThreshold = params["ppi_threshold_of_features"]
    pctTfThreshold = float(params["max_percentage_RT_range"])
    tolIsolation = float(params["isolation_window"])
    tolPrecursor = float(params["tol_precursor"])
    tolIntraMS2Consolidation = float(params["tol_intra_ms2_consolidation"])
    tolInterMS2Consolidation = float(params["tol_inter_ms2_consolidation"])
    nFeatures = len(full)
    nFiles = len(mzxmlFiles)
    featureToScan = np.empty((nFeatures, nFiles), dtype=object)
    featureToSpec = np.empty((nFeatures, nFiles), dtype=object)

    #################################################
    # Assignment of MS2 spectra to features         #
    # Consolidation of MS2 spectra for each feature #
    #################################################
    m = -1  # Index for input files
    for file in mzxmlFiles:
        m += 1
        reader = mzxml.MzXML(file)
        fileBasename, _ = os.path.splitext(os.path.basename(file))
        colNames = [item for item in full.dtype.names if item.startswith(fileBasename + "_")]
        subset = full[colNames]
        subset.dtype.names = [s.split("_")[-1] for s in subset.dtype.names]
        ms2Dict = {}
        minScan, maxScan = int(np.nanmin(subset["minMS1"])), int(np.nanmax(subset["maxMS1"]))
        progress = utils.progressBar(maxScan - minScan + 1)
        print(" %s is being processed" % os.path.basename(file))
        print(" Looking for MS2 scan(s) responsible for each feature")
        logging.info(" %s is being processed" % os.path.basename(file))
        logging.info(" Looking for MS2 scan(s) responsible for each feature")
        for i in range(minScan, maxScan + 1):
            progress.increment()
            spec = reader[str(i)]
            msLevel = spec["msLevel"]
            if msLevel == 1:
                surveyNum = i
            elif msLevel == 2:
                # Find MS2 scans which satisfy the following conditions
                # From the discussion around June 2020,
                # 1. In ReAdW-derived mzXML files, precursor m/z values are in two tags: "precursorMz" and "filterLine"
                # 2. Through Haiyan's manual inspection, the real precursor m/z value is closer to the one in the "filterLine" tag
                # 3. So, in this script, the precursor m/z of an MS2 scan is obtained from the "filterLine" tag
                # 4. Note that it may be specific to ReAdW-derived mzXML files since MSConvert-derived
                #    mzXML files do not have the "filterLine" tag
                # 4.1. In this case, maybe the use of mzML (instead of mzXML) would be a solution (to-do later)
                # precMz = spec["precursorMz"][0]["precursorMz"]  # Precursor m/z from "precursorMz" tag
                p = re.search(r"([0-9.]+)\@", spec["filterLine"])
                precMz = float(p.group(1))
                survey = reader[str(surveyNum)]
                fInd = np.where((surveyNum >= subset["minMS1"]) &
                                (surveyNum <= subset["maxMS1"]) &
                                (subset["mz"] >= (precMz - tolIsolation)) &
                                (subset["mz"] <= (precMz + tolIsolation)) &
                                (subset["PercentageTF"] <= pctTfThreshold))[0]
                if len(fInd) > 0:
                    ppi = []
                    for j in range(len(fInd)):
                        mz = subset["mz"][fInd[j]]
                        lL = mz - mz * tolPrecursor / 1e6
                        uL = mz + mz * tolPrecursor / 1e6
                        ind = np.where((survey["m/z array"] >= lL) & (survey["m/z array"] <= uL))[0]
                        if len(ind) > 0:
                            ppi.append(np.max(survey["intensity array"][ind]))
                        else:
                            ppi.append(0)
                    if sum(ppi) == 0:
                        continue
                    ppi = ppi / np.sum(ppi) * 100  # Convert intensities to percentage values
                    if ppiThreshold == "max":
                        fInd = np.array([fInd[np.argmax(ppi)]])
                    else:
                        # ppiThreshold should be a numeric value
                        ppiThreshold = float(ppiThreshold)
                        fInd = fInd[np.where(ppi > ppiThreshold)]
                    if len(fInd) == 0:  # Last check of candidate feature indexes
                        continue
                    else:
                        # Add this MS2 scan information to ms2Dict
                        ms2Dict[spec["num"]] = {}
                        ms2Dict[spec["num"]]["mz"] = spec["m/z array"]
                        ms2Dict[spec["num"]]["intensity"] = spec["intensity array"]
                        # Mapping between features and MS2 scan numbers
                        for j in range(len(fInd)):
                            if featureToScan[fInd[j], m] is None:
                                featureToScan[fInd[j], m] = spec["num"]
                            else:
                                featureToScan[fInd[j], m] += ";" + spec["num"]

        print(" Merging MS2 spectra for each feature within a run (it may take a while)")
        logging.info(" Merging MS2 spectra for each feature within a run (it may take a while)")
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            if featureToScan[i, m] is not None:
                spec = intraConsolidation(ms2Dict, featureToScan[i, m], tolIntraMS2Consolidation)
                featureToSpec[i, m] = spec

    print(" Merging MS2 spectra for each feature between runs when there are multiple runs")
    print(" Simplification of the MS2 spectrum for each feature by retaining the 100 strongest peaks")
    logging.info(" Merging MS2 spectra for each feature between runs when there are multiple runs")
    logging.info(" Simplification of the MS2 spectrum for each feature by retaining the 100 strongest peaks")
    specArray = np.array([])
    progress = utils.progressBar(nFeatures)
    for i in range(nFeatures):
        progress.increment()
        if np.sum(featureToSpec[i] == None) == nFiles:
            specArray = np.append(specArray, None)
        else:
            spec = interConsolidation(featureToSpec[i, :], tolInterMS2Consolidation)
            specArray = np.append(specArray, spec)

    ###############################
    # MS2 processing for features #
    ###############################
    # "specArray" is the list of (consolidated) MS2 spectra
    # specArray[i] is the MS2 spectrum corresponding to the i-th feature
    # If there's no MS2 spectrum, then specArray[i] is None
    df = utils.summarizeFeatures(full, params)
    # Add the mean m/z of the feature and its charge state to the beginning of the MS2 spectrum (similar to a .dta file)
    for i in range(nFeatures):
        if specArray[i] is not None:
            specArray[i]["mz"] = np.insert(specArray[i]["mz"], 0, df["feature_m/z"].iloc[i])
            specArray[i]["intensity"] = np.insert(specArray[i]["intensity"], 0, df["feature_z"].iloc[i])
    df["MS2"] = specArray
    df = df.sort_values(by="feature_m/z", ignore_index=True)  # Features are sorted by "feature_m/z"
    df.insert(loc=0, column="feature_num", value=df.index + 1)
    # df["feature_num"] = df.index + 1  # Update "feature_num" according to the ascending order of "feature_m/z" (as sorted)

    # Write MS2 spectra to files
    filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
    ms2Path = os.path.join(filePath, "MS2")
    if not os.path.exists(ms2Path):
        os.mkdir(ms2Path)
    for i in range(df.shape[0]):
        if df["MS2"].iloc[i] is not None:
            fileName = os.path.join(ms2Path, "f" + str(i + 1) + ".MS2")
            dfMS2 = pd.DataFrame.from_dict(df["MS2"].iloc[i])
            dfMS2.to_csv(fileName, index=False, header=False, sep="\t")

    # Save fully-aligned features with their MS2 spectra (i.e. res) for debugging purposes
    # When the pipeline gets mature, this part needs to be removed
    pickle.dump(df, open(os.path.join(filePath, ".fully_aligned_feature.pickle"), "wb"))  # Make the file hidden

    ##########################
    # Handling mzXML file(s) #
    ##########################
    # Move mzXML files to the directory(ies) where individual .feature files are located
    if params["skip_feature_detection"] == "0":
        for file in mzxmlFiles:
            baseFilename = os.path.basename(file)
            featureDirectory = os.path.join(os.getcwd(), os.path.splitext(baseFilename)[0])
            os.rename(file, os.path.join(featureDirectory, baseFilename))

    return df, featureToScan
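# Hypothetical chaining of the pipeline steps around ms2ForFeatures (the file list and the
# preceding calls are assumptions pieced together from the surrounding snippets; fArray
# holds one feature DataFrame per run, e.g. from detectFeatures):
mzxmlFiles = ["FTLD_Batch2_F50.mzXML", "FTLD_Batch2_F51.mzXML"]  # placeholder paths
fArray = [detectFeatures(f, "jumpm.params") for f in mzxmlFiles]
dfFull, dfPartial, dfUnaligned = alignFeatures(fArray, mzxmlFiles, "jumpm.params")
dfWithMs2, featureToScan = ms2ForFeatures(dfFull, mzxmlFiles, "jumpm.params")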
import os, sys, sqlite3, logging
import numpy as np
import pandas as pd
from scipy import stats
import rpy2.robjects as ro
from rpy2.robjects.vectors import FloatVector
from statsmodels.distributions.empirical_distribution import ECDF
import utils

def searchLibrary(full, paramFile):
    ##################################
    # Load parameters and initialize #
    ##################################
    try:
        params = utils.getParams(paramFile)
    except:
        sys.exit("Parameter file cannot be found or cannot be loaded")

    condition = params["LC_column"].lower()
    if params["mode"] == "1":
        condition = condition + "p"
    elif params["mode"] == "-1":
        condition = condition + "n"
    else:
        sys.exit("'mode' parameter should be either 1 or -1")
    proton = 1.007276466812
    matchMzTol = float(params["library_mass_tolerance"])  # Unit of ppm
    adducts = adductDictionary(params)
    nFeatures = full.shape[0]

    # While full["feature_RT"] has the unit of minute, the library compounds have RTs in the unit of second
    # So, within this function, full["feature_RT"] needs to be converted to the unit of second
    full["feature_RT"] = full["feature_RT"] * 60

    ##########################
    # Perform library search #
    ##########################
    allRes = pd.DataFrame()
    nLibs = 1
    for libFile in params["library"]:
        doAlignment = int(params["library_rt_alignment"])
        print(" Library {} is being loaded".format(os.path.basename(libFile)))
        logging.info(" Library {} is being loaded".format(os.path.basename(libFile)))
        try:
            conn = sqlite3.connect(libFile)
        except:
            sys.exit("Library file cannot be found or cannot be loaded.")

        #####################################################
        # RT-alignment between features and library entries #
        #####################################################
        # Check whether the 'rt' column of the library is numeric or not
        hasNumericRt = 0
        cursor = conn.execute("PRAGMA table_info(library)")
        pragma = cursor.fetchall()
        for row in pragma:
            if row[1].lower() == "rt":
                if row[2].lower() == "real":
                    hasNumericRt = 1
                break
        # RT-alignment
        if doAlignment == 1:
            if hasNumericRt == 1:
                print(" RT-alignment is being performed between features and library compounds")
                logging.info(" RT-alignment is being performed between features and library compounds")
                x, y = prepRtAlignment(full, conn, params)
                mod = rtAlignment(x, y)
                if mod == -1:
                    print(" Since there are TOO FEW feature RTs comparable to library RTs, RT-alignment is skipped")
                    logging.info(" Since there are TOO FEW feature RTs comparable to library RTs, RT-alignment is skipped")
                    doAlignment = 0
                else:
                    # Calibration of features' RT
                    rPredict = ro.r("predict")
                    full["feature_calibrated_RT"] = None
                    full["feature_calibrated_RT"] = full["feature_RT"] - rPredict(mod, FloatVector(full["feature_RT"]))
                    # Empirical CDF of alignment (absolute) residuals (will be used to calculate RT shift-based scores)
                    ecdfRt = ECDF(abs(np.array(mod.rx2("residuals"))))
            else:
                print(" Although the parameter is set to perform RT-alignment against the library, there are no valid RT values in the library")
                print(" Therefore, RT-alignment is not performed")
                logging.info(" Although the parameter is set to perform RT-alignment against the library, there are no valid RT values in the library")
                logging.info(" Therefore, RT-alignment is not performed")
                doAlignment = 0
        else:
            print(" According to the parameter, RT-alignment is not performed between features and library compounds")
            logging.info(" According to the parameter, RT-alignment is not performed between features and library compounds")

        ########################################
        # Match features and library compounds #
        ########################################
        print(" Features are being compared with library compounds")
        logging.info(" Features are being compared with library compounds")
        res = {"no": [], "feature_index": [], "feature_m/z": [], "feature_original_RT": [],
               "feature_aligned_RT": [], "id": [], "other_id": [], "formula": [], "name": [],
               "ion": [], "RT": [], "SMILES": [], "InchiKey": [], "collision_energy": [],
               "RT_shift": [], "RT_score": [], "MS2_score": [], "combined_score": []}
        intensityCols = [col for col in full.columns if col.lower().endswith("_intensity")]
        for c in intensityCols:
            res[c] = []
        n = 0
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            # Feature information
            fZ = full["feature_z"].iloc[i]
            fSpec = full["MS2"].iloc[i]
            if np.isnan(fZ) or fSpec is None:
                # When the MS2 spectrum of the feature is not defined, skip it
                continue
            fMz = full["feature_m/z"].iloc[i]
            fRt = full["feature_RT"].iloc[i]
            fIntensity = full[intensityCols].iloc[i]
            if params["mode"] == "1":  # Positive mode
                fMass = fZ * (fMz - proton)
            elif params["mode"] == "-1":  # Negative mode
                fMass = fZ * (fMz + proton)
            # Retrieve library compounds of which neutral masses are similar to the feature mass
            df = queryLibrary(fMz, fMass, fZ, conn, adducts, matchMzTol)
            if not df.empty:
                colNameOtherId = df.filter(regex="other_ids").columns[0]
                for j in range(df.shape[0]):
                    # When there is/are library compound(s) matched to the feature,
                    # the MS2 spectrum of the library compound(s) should be retrieved
                    uid = df["id"].iloc[j]
                    uid = uid.replace("##Decoy_", "")
                    sqlQuery = r"SELECT * FROM {}".format(uid)
                    try:
                        libSpec = pd.read_sql_query(sqlQuery, conn)
                    except:
                        continue
                    if not libSpec.empty:
                        n += 1
                        # Calculate the score based on MS2 spectrum
                        libSpec = libSpec.to_dict(orient="list")
                        simMs2 = calcMS2Similarity(fSpec, libSpec, params)
                        pMs2 = 1 - simMs2  # p-value-like score (the smaller, the better)
                        pMs2 = max(np.finfo(float).eps, pMs2)  # Prevent the underflow caused by 0
                        # Calculate the (similarity?) score based on RT-shift
                        if doAlignment == 1:
                            fAlignedRt = full["feature_calibrated_RT"].iloc[i]
                            rtShift = fAlignedRt - df["rt"].iloc[j]
                            pRt = ecdfRt(abs(rtShift))  # Also a p-value-like score (the smaller, the better)
                            pRt = max(np.finfo(float).eps, pRt)
                            simRt = 1 - pRt
                            # p = 1 / (0.5 / pMs2 + 0.5 / pRt)  # Combined p-value using harmonic mean with equal weights
                            p = 1 - stats.chi2.cdf(-2 * (np.log(pMs2) + np.log(pRt)), 4)  # Fisher's method
                            # p = -2 * (np.log(pMs2) + np.log(pRt))  # Fisher's method used in the Perl pipeline (the smaller, the better)
                        else:
                            fAlignedRt = "NA"
                            if hasNumericRt == 1 and df["rt"].iloc[j] is not None:
                                rtShift = fRt - df["rt"].iloc[j]
                            else:
                                rtShift = "NA"
                            # pRt = 1
                            simRt = "NA"
                            p = pMs2
                        # Output
                        libId = df["id"].iloc[j]
                        libOtherId = df[colNameOtherId].iloc[j]
                        libFormula = df["formula"].iloc[j]
                        libName = df["name"].iloc[j]
                        if hasNumericRt == 1:
                            libRt = df["rt"].iloc[j]
                        else:
                            libRt = "NA"
                        libIon = df["ion_type"].iloc[j]
                        libSmiles = df["smiles"].iloc[j]
                        libInchiKey = df["inchikey"].iloc[j]
                        libEnergy = df["collision_energy"].iloc[j]
                        res["no"].append(n)
                        res["feature_index"].append(i + 1)
                        res["feature_m/z"].append(fMz)
                        res["feature_original_RT"].append(fRt / 60)  # For output, the unit of RT is minute
                        if doAlignment == 1:
                            res["feature_aligned_RT"].append(fAlignedRt / 60)
                        else:
                            res["feature_aligned_RT"].append(fAlignedRt)
                        for c in intensityCols:
                            res[c].append(fIntensity[c])
                        res["id"].append(libId)
                        res["other_id"].append(libOtherId)
                        res["formula"].append(libFormula)
                        res["name"].append(libName)
                        res["ion"].append(libIon)
                        if hasNumericRt == 1:
                            res["RT"].append(libRt / 60)
                        else:
                            res["RT"].append(libRt)
                        res["SMILES"].append(libSmiles)
                        res["InchiKey"].append(libInchiKey)
                        res["collision_energy"].append(libEnergy)
                        if rtShift != "NA":
                            rtShift = abs(rtShift) / 60  # Convert to "minute"
                        res["RT_shift"].append(rtShift)
                        # Haiyan's preference
                        # RT_score and MS2_score: 0 ~ 1 (bad to good)
                        res["RT_score"].append(simRt)
                        res["MS2_score"].append(simMs2)
                        res["combined_score"].append(abs(-np.log10(p)))
        conn.close()
        res = pd.DataFrame.from_dict(res)
        resCols = ["no", "feature_index", "feature_m/z", "feature_original_RT", "feature_aligned_RT"] + intensityCols + \
                  ["id", "other_id", "formula", "name", "ion", "RT", "SMILES", "InchiKey",
                   "collision_energy", "RT_shift", "RT_score", "MS2_score", "combined_score"]
        res = res[resCols]
        res = res.rename(columns={"other_id": colNameOtherId})
        filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
        outputFile = os.path.join(filePath, "align_" + params["output_name"] + "." + str(nLibs) + ".library_matches")
        res.to_csv(outputFile, sep="\t", index=False)
        allRes = allRes.append(res, ignore_index=True)
        nLibs += 1

    # RT unit of "full" needs to be converted back to minute for subsequent procedures (i.e. database search)
    full["feature_RT"] = full["feature_RT"] / 60
    return allRes
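# A hedged final step (assumption: dfWithMs2 is the output of ms2ForFeatures above, carrying
# the "MS2", "feature_z" and per-run "*_intensity" columns that searchLibrary expects, and
# the parameter file lists one or more SQLite library files under "library"):
libraryMatches = searchLibrary(dfWithMs2, "jumpm.params")
print("{} library match(es) found".format(libraryMatches.shape[0]))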