def calculate_sample_means(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Calculate and add the mean intensity of each sample's replicates
    to the input dataframe.

    One new column per sample is added to 'data'. Its name is the name
    of the sample's first replicate column (without its trailing digits
    when there is more than one technical replicate) followed by
    "_mean". Zero intensities are not taken into account to compute the
    mean; a sample whose replicates are all zero gets a mean of 0.
    Means are rounded to the nearest integer and stored as "int64".

    Keyword Arguments:
        data       -- LFDataFrame instance (modified in place)
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    numTechReps = parameters['numTechReps']
    # Column index of first sample replicate
    startIndex = parameters['firstSampleIndex'] - 1
    # Column index right after the last sample replicate
    endIndex = startIndex + (parameters['numSamples'] * numTechReps)
    if (numTechReps == 1):
        # The mean of a single replicate is the replicate itself, so the
        # mean column will have a (rounded) copy of the single replicate
        for firstIndex in range(startIndex, endIndex):
            colName = data.columns[firstIndex] + '_mean'
            # Use an explicit "int64" (as in the branch below) so the
            # resulting dtype does not depend on the platform
            data[colName] = data.iloc[:, firstIndex].astype(float).round(
                0).astype("int64")
    else:
        for firstIndex in range(startIndex, endIndex, numTechReps):
            lastIndex = firstIndex + numTechReps
            # Create the column name for the mean of the current sample
            # (raw string: '\d' is not a valid string escape sequence)
            colName = re.sub(r'\d+$', "", data.columns[firstIndex]) + '_mean'
            replicates = data.iloc[:, firstIndex:lastIndex]
            # Number of non-zero replicates per row; rows without any
            # non-zero replicate divide by 1 so their mean stays at 0
            nonZeroCount = replicates.astype(bool).sum(axis=1)
            nonZeroCount[nonZeroCount == 0] = 1
            # Get means (not taking into account zeros) of the sample
            rawMeans = replicates.sum(axis=1) / nonZeroCount
            # Round to nearest integer, cast to integer and insert
            # sample means into the dataframe
            data[colName] = rawMeans.round(0).astype("int64")
def remove_isotopes(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Annotate, and optionally remove, isotopes of parent analytes.

    For each of the last 'numSamples' columns of 'data' a new column
    named "<column>_isotopes" is appended with the tags returned by
    _detect_sample_isotopes(). If 'removeIsotopes' is set, the intensity
    of every frame tagged as an isotope ("M+<n>") is set to 0 in the
    corresponding column and empty frames are dropped afterwards.

    Keyword Arguments:
        data       -- LFDataFrame instance (modified in place)
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    removeIsotopes = parameters['removeIsotopes']
    # Calculate the location of sample columns based on the current
    # state of the dataframe (before adding isotope annotation)
    lastSampleCol = len(data.columns)
    firstSampleCol = lastSampleCol - parameters['numSamples']
    # These columns are not modified inside the loop, so extract them
    # only once. The first column of 'data' is used as the frame index.
    mzValues = data[mzCol].values
    rtValues = data[rtCol].values
    idValues = data.iloc[:, 0].values
    for i in range(firstSampleCol, lastSampleCol):
        colName = data.columns[i]
        # Create an array from 'data' with m/z, retention time, the
        # sample's intensity and index per row
        array = numpy.stack(
            (mzValues, rtValues, data.iloc[:, i].values, idValues), axis=-1)
        tagArray = _detect_sample_isotopes(array, parameters)
        isoColName = colName + '_isotopes'
        data.insert(len(data.columns), isoColName, tagArray)
        if (removeIsotopes):
            # Set the intensity of the sample detected isotopes to 0
            # (raw string: '\+' is not a valid string escape sequence)
            data.loc[data[isoColName].str.contains(r'M\+'), colName] = 0.0
    if (removeIsotopes):
        # Drop empty frames, i.e. isotope frames found in every sample
        data.drop_empty_frames(
            'Isotope removal (isotopes found in every sample)', parameters,
            True)
def _correct_replicates_rt(featureCluster, rtArray, firstIndex, lastIndex,
                           parameters, repsPerGroup):
    # type: (pandas.DataFrame, numpy.ndarray, int, int, LFParameters, int) -> None
    """Correct retention time misalignment in the columns
    [firstIndex, lastIndex) of the given feature cluster (in place).

    Keyword Arguments:
        featureCluster -- frames with the same feature cluster ID
        rtArray        -- retention time of each frame in the cluster
        firstIndex     -- position of the first column to process
        lastIndex      -- position after the last column to process
        parameters     -- LipidFinder's PeakFilter parameters instance
        repsPerGroup   -- number of columns handled as one group
    """
    tmpData = featureCluster.iloc[:, firstIndex : lastIndex]
    # Get the index of frames with at least 1 column with a non-zero
    # intensity
    nonZeroIndices = numpy.where(tmpData.sum(axis=1) > 0)[0]
    if (nonZeroIndices.size <= 1):
        return
    # Get an array of the time difference to next frame. The last item
    # wraps around to the first frame; it is not a real difference.
    rtDiff = numpy.roll(rtArray[nonZeroIndices], -1) \
            - rtArray[nonZeroIndices]
    # Get the array of intensities for the frames with at least 1 column
    # with a non-zero intensity (integer-array indexing returns a copy)
    intensity = tmpData.values[nonZeroIndices]
    __process_sample__(intensity, rtDiff, parameters, repsPerGroup)
    # Replace old values with the new ones. Write through the positional
    # indexer instead of through ".values": the latter may be a copy
    # (mixed dtypes) or read-only (Copy-on-Write), losing the update.
    featureCluster.iloc[nonZeroIndices, firstIndex : lastIndex] = intensity


def __process_feature__(featureCluster, parameters, means):
    # type: (pandas.DataFrame, LFParameters, bool) -> pandas.DataFrame
    """Correct retention time misalignment in the given feature cluster.

    The cluster is modified in place and also returned. Clusters with a
    single frame are returned untouched.

    Keyword Arguments:
        featureCluster -- frames with the same feature cluster ID
        parameters     -- LipidFinder's PeakFilter parameters instance
        means          -- perform the correction over the last
                          'numSamples' columns (expected to be the
                          sample means) instead of over each set of
                          sample replicates?
    """
    if (len(featureCluster) == 1):
        return featureCluster
    # Get array of retention times (RT)
    rtArray = featureCluster[parameters['rtCol']].values
    if (means):
        # The sample means for the feature cluster are processed as a
        # single group
        numColumns = len(featureCluster.columns)
        _correct_replicates_rt(
            featureCluster, rtArray, numColumns - parameters['numSamples'],
            numColumns, parameters, parameters['numSamples'])
    else:
        numTechReps = parameters['numTechReps']
        firstSampleIndex = parameters['firstSampleIndex'] - 1
        lastSampleIndex = firstSampleIndex \
                + (parameters['numSamples'] * numTechReps)
        # Loop through each set of replicates per sample
        for firstIndex in range(firstSampleIndex, lastSampleIndex,
                                numTechReps):
            _correct_replicates_rt(
                featureCluster, rtArray, firstIndex,
                firstIndex + numTechReps, parameters, numTechReps)
    return featureCluster
def remove_outliers(data, parameters, src='samples'):
    # type: (LFDataFrame, LFParameters, str) -> None
    """Remove outliers from a set of replicates on a row by row basis.

    Each group of replicates is processed by __reps_frame__(). All
    sample replicates may be discarded if the relative standard
    deviation (RSD) of the remaining replicates cannot be reduced below
    the established threshold. Empty frames are dropped afterwards.

    Keyword Arguments:
        data       -- LFDataFrame instance (modified in place)
        parameters -- LipidFinder's PeakFilter parameters instance
        src        -- columns where to check for outliers: "samples" or
                      "blanks"
                      [default: "samples"]

    Raises:
        ValueError -- if 'src' is neither "samples" nor "blanks"
    """
    if (src not in ['samples', 'blanks']):
        raise ValueError('Unexpected value. Options: samples, blanks')
    if (len(data) == 0):
        # Nothing to correct (the dummy row below needs a first row)
        return
    # Set the corresponding values regarding the columns to evaluate
    if (src == 'samples'):
        startIndex = parameters['firstSampleIndex'] - 1
        endIndex = startIndex \
                + (parameters['numSamples'] * parameters['numTechReps'])
        repsPerGroup = parameters['numTechReps']
    else:
        # Solvent replicates are located after the sample and QC
        # replicates, and they are all evaluated as a single group
        startIndex = parameters['firstSampleIndex'] \
                + (parameters['numSamples'] * parameters['numTechReps']) \
                + parameters['numQCReps'] - 1
        endIndex = startIndex + parameters['numSolventReps']
        repsPerGroup = parameters['numSolventReps']
    # Add dummy row to avoid unexpected behavior when using apply(): "In
    # the current implementation, apply calls func twice on the first
    # column/row to decide whether it can take a fast or slow code
    # path."
    # DataFrame.append() was removed in pandas 2.0: use concat() instead
    firstRow = data.iloc[0, :].to_frame().transpose()
    tmpData = pandas.concat([firstRow, data], ignore_index=True)
    # Loop through each set of replicates per sample, in each case
    # slicing out and processing 1 sample's replicate
    for firstIndex in range(startIndex, endIndex, repsPerGroup):
        lastIndex = firstIndex + repsPerGroup
        tmpData.iloc[:, firstIndex : lastIndex] = \
                tmpData.iloc[:, firstIndex : lastIndex].apply(
                        __reps_frame__, axis=1, parameters=parameters)
    # Copy to data the new replicates values after removing the first
    # dummy row
    tmpData = tmpData.iloc[1:]
    tmpData.index = tmpData.index - 1
    data.iloc[:, startIndex : endIndex] = \
            tmpData.iloc[:, startIndex : endIndex]
    # Drop empty frames (if any)
    data.drop_empty_frames('Empty frames after Outlier Correction', parameters)
def rm_full_frags(
        array,      # type: numpy.ndarray
        fragments,  # type: pandas.DataFrame
        parameters  # type: LFParameters
    ):
    # type: (...) -> list[float]
    """Return the row positions in 'array' of common in-source
    fragments.

    A feature of 'array' is reported when its m/z matches one of the
    m/z values in 'fragments' and there is at least one feature with an
    m/z equal or greater than the fragment's m/z cut-off at the same
    retention time (RT). All m/z and RT matching are computed within
    tolerance. The m/z lookup relies on a binary search over the first
    column of 'array'.

    Keyword arguments:
        array      -- array with m/z (first column) and RT (second
                      column) per row
        fragments  -- in-source fragments to be removed ("MZ" and
                      "MZCutOff" columns)
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzValues = array[:, 0]
    rtValues = array[:, 1]
    # One (fragment m/z, m/z cut-off) pair per row
    fragsArray = numpy.stack(
        (fragments['MZ'].values, fragments['MZCutOff'].values), axis=-1)
    fragsIndex = []
    for fragMZ, mzCutOff in fragsArray:
        mzRange = mz_tol_range(fragMZ, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        # First and last positions of the 'array' features that match
        # the in-source fragment m/z value
        firstMatch, lastMatch = numpy.searchsorted(mzValues, mzRange)
        if (firstMatch == lastMatch):
            continue
        # Features at or above the cut-off are the same for every match
        aboveCutOff = mzValues >= mzCutOff
        for index in range(firstMatch, lastMatch):
            # To be a match, there must be a feature above the cut-off
            # eluting at the same RT
            minRT, maxRT = rt_tol_range(array[index, 1], RT_TOLERANCE)
            sameRT = (rtValues >= minRT) & (rtValues <= maxRT)
            if ((aboveCutOff & sameRT).any()):
                # Mark the feature as an in-source fragment
                fragsIndex.append(index)
    return fragsIndex
def cluster_by_features(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Cluster contiguous ions within the same mass cluster where each
    member is separated by a retention time difference of less than
    'maxRTDiffAdjFrame' (in 'parameters').

    'data' is sorted in place by mass cluster ID, retention time and m/z
    and its index is reset. Feature clusters are identified and each
    assigned an arbitrary unique identifier (starting at 1) in the new
    "FeatureClusterID" column, stored as floats.

    Keyword Arguments:
        data       -- LFDataFrame instance (modified in place). It must
                      contain the "mzClusterID" column.
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    maxRTDiff = parameters['maxRTDiffAdjFrame']
    # Re-sort dataframe ready for feature clustering
    data.sort_values(by=['mzClusterID', rtCol, mzCol], inplace=True,
                     kind='mergesort')
    # Reset index
    data.reset_index(inplace=True, drop=True)
    rtValues = data[rtCol].values
    mzClusterIDs = data['mzClusterID'].values
    numRowsData = len(data)
    # Assign a feature cluster ID to each cluster of contiguous ions
    # within the same mass cluster where each member is separated by a
    # retention time difference of less than 'maxRTDiffAdjFrame'. The
    # IDs are built in a separate array and assigned as a whole column
    # afterwards, rather than written through a view of the dataframe.
    featureClusterIDs = numpy.empty(numRowsData, dtype=float)
    clusterID = 1
    for index in range(numRowsData):
        # A new feature cluster starts when the mass cluster changes or
        # the retention time gap to the previous frame is too large
        if ((index > 0)
            and ((mzClusterIDs[index - 1] != mzClusterIDs[index])
                 or ((rtValues[index] - rtValues[index - 1]) > maxRTDiff))):
            clusterID += 1
        featureClusterIDs[index] = clusterID
    data['FeatureClusterID'] = featureClusterIDs
def get_fdr(data, parameters):
    # type: (LFDataFrame, LFParameters) -> float
    """Return the False Discovery Rate (FDR) of the dataset following a
    target-decoy strategy.

    The value is calculated based on the number of m/z values of 'data'
    found in the COMP_DB database from LIPID MAPS, and the number of m/z
    values of 'data' found in a decoy database, created adding 0.5 Da to
    every m/z in COMP_DB (a very rare lipid mass defect). FDR is equal
    to the number of decoy hits divided by the number of target hits.

    Keyword arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance

    Raises:
        ValueError -- if there are no matches in the target database
    """
    # Get the list of unique m/z values from 'data'
    mzList = data[parameters['mzCol']].unique().tolist()
    # Set the target adducts. The polarity is compared ignoring case so
    # that e.g. "positive" does not silently select the negative adducts
    if (parameters['polarity'].lower() == 'positive'):
        targetAdducts = (
            "M+H,M+H-H2O,M+2H,M+3H,M+4H,M+NH4,M+Ag,M+Na,M+2Na,M+K,"
            "M+2K,M+Li,M+2Li")
    else:
        targetAdducts = 'M-H,M-CH3,M-2H,M-3H,M-4H,M.F,M.HF2,M.Cl,M.OAc,M.HCOO'
    # Get the number of matches in batches to balance the number of
    # requests and the amount of information requested
    numTargetHits = 0
    numDecoyHits = 0
    for start in range(0, len(mzList), BATCH_SIZE):
        mzBatch = mzList[start:start + BATCH_SIZE]
        # Get a string with one m/z per line (text file alike)
        mzStr = os.linesep.join(map(str, mzBatch))
        numTargetHits += _get_num_matches('COMP_DB', mzStr, targetAdducts)
        numDecoyHits += _get_num_matches('COMP_DB_5', mzStr, targetAdducts)
    # Raise an exception if there are no matches in the target database
    if (numTargetHits == 0):
        raise ValueError(("No matches found in the target database. The FDR "
                          "cannot be computed."))
    # FDR = numDecoyHits / numTargetHits
    return float(numDecoyHits) / numTargetHits
def _detect_sample_isotopes(array, parameters):
    # type: (numpy.ndarray, LFParameters) -> numpy.ndarray
    """Return an array with the tagged parents and their corresponding
    isotopes in the same order as in the given sample array.

    Parents are tagged as "[<index>][M]<polarity sign>" and their
    isotopes as "[<parent index>][M+<n>]<polarity sign>"; every other
    frame keeps an empty string. A frame is tagged as parent only if its
    first isotope is found. The m/z lookup relies on a binary search
    over the first column of 'array'.

    Keyword Arguments:
        array      -- array with m/z, retention time (RT), sample's
                      intensity mean and index of the original dataframe
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    # Get the corresponding symbol for the polarity of the data (+ or -)
    polSign = '+' if (parameters['polarity'].lower() == 'positive') else '-'
    # Create an array of empty strings that will contain the tagged
    # parents and their corresponding isotopes
    tagArray = numpy.full(len(array), '', dtype=object)
    # Loop over each m/z to search for isotopes
    isotopesIndex = set()
    for index in range(0, len(array)):
        # Skip if frame has already been identified as an isotope
        if (array[index, 3] in isotopesIndex):
            continue
        # The parent information does not change with the isotope number
        parentMZ = array[index, 0]
        parentRT = array[index, 1]
        parentInten = array[index, 2]
        tagID = int(array[index, 3])
        for isoPeak in range(1, parameters['numIsotopes'] + 1):
            # Get the first and last indexes of the frames that are
            # within the isotope m/z range for the current analyte
            isotopeMZ = parentMZ + ISO_OFFSET * isoPeak
            minMZ, maxMZ = mz_tol_range(isotopeMZ, parameters['mzFixedError'],
                                        parameters['mzPPMError'])
            mzMatches = numpy.searchsorted(array[:, 0], [minMZ, maxMZ])
            if (mzMatches[0] == mzMatches[1]):
                # Have not found any analyte with an isotope-like m/z
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                continue
            # Filter m/z matches with the same RT as the parent
            minRT, maxRT = rt_tol_range(parentRT,
                                        parameters['maxRTDiffAdjFrame'])
            candidateRT = array[mzMatches[0] : mzMatches[1], 1]
            rtMatches = numpy.where((candidateRT >= minRT)
                                    & (candidateRT <= maxRT))[0]
            if (len(rtMatches) == 0):
                # No candidates are within the same RT
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                continue
            # Resultant indexes are based on the previous search
            rtMatches += mzMatches[0]
            # Filter the candidate isotopes by intensity. The intensity
            # range coefficients vary depending on the isotope number.
            if (isoPeak == 1):
                # Get an estimated maximum number of C in the molecule
                numC = round(parentMZ / 12)
                # Calculate isotopic distribution based on polynomial
                # expansion
                baseIntensity = parentInten * (numC ** 1.3) * 0.002
                minIntensity = baseIntensity * parameters['isoIntensityCoef'][0]
                maxIntensity = baseIntensity * parameters['isoIntensityCoef'][1]
            elif (isoPeak == 2):
                # Get an estimated maximum number of C in the molecule
                numC = round(parentMZ / 12)
                # Calculate isotopic distribution based on polynomial
                # expansion
                baseIntensity = parentInten * (numC ** 1.7) * 0.0001
                minIntensity = baseIntensity * parameters['isoIntensityCoef'][0]
                maxIntensity = baseIntensity * parameters['isoIntensityCoef'][1]
            else:
                # Calculate isotopic distribution with the same formula
                # as CAMERA (from XCMS)
                minIntensity = parentInten * float('1e-{0}'.format(isoPeak + 2))
                maxIntensity = parentInten * 2
            candidateInten = array[rtMatches, 2]
            isotopes = numpy.where((candidateInten >= minIntensity)
                                   & (candidateInten <= maxIntensity))[0]
            if (len(isotopes) == 0):
                # No candidates have an intensity within expected range
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                continue
            # 'isotopes' holds positions within 'rtMatches', which may
            # not be contiguous rows of 'array': translate them through
            # 'rtMatches' instead of adding an offset
            isotopes = rtMatches[isotopes]
            # Tag the analyte as isotope and save its index to avoid
            # checking it as parent of other analytes
            tagArray[isotopes] = '[{0}][M+{1}]{2}'.format(tagID, isoPeak,
                                                          polSign)
            isotopesIndex.update(array[isotopes, 3])
        else:
            # The search was not interrupted (the first isotope was
            # found): tag the analyte as parent
            tagArray[index] = '[{0}][M]{1}'.format(tagID, polSign)
    return tagArray
def __process_sample__(intensity, rtDiff, parameters, repsPerGroup):
    # type: (numpy.ndarray, numpy.ndarray, LFParameters, int) -> None
    """Correct retention time misalignment in the given sample.

    A zero intensity in a frame where at least half of the replicates
    are non-zero is replaced by the non-zero intensity of the same
    replicate in a contiguous frame (which is then set to zero) when
    that frame is within the retention time (RT) threshold, its value is
    within the allowed deviation from the mean of the current frame, and
    the contiguous frame has fewer than half non-zero replicates (or
    exactly half and a mean not greater than the current frame's). The
    process is repeated until no more values are moved. 'intensity' is
    modified in place.

    Keyword Arguments:
        intensity    -- intensity per frame (rows) and sample's
                        replicate (columns)
        rtDiff       -- time differences between consecutive frames
        parameters   -- LipidFinder's PeakFilter parameters instance
        repsPerGroup -- number of replicates per sample
    """
    maxRTDiff = parameters['maxRTDiffAdjFrame']
    stDevFactor = parameters['intensityStDev']
    # Number of frames and replicates in the given feature cluster
    numRows, numCols = intensity.shape
    # Track modifications with a flag instead of comparing the array
    # with a copy of itself: arrays containing NaN never compare equal,
    # which would repeat the process forever
    modified = True
    while (modified):
        modified = False
        for rep in range(0, numCols):
            for row in range(0, numRows):
                if (intensity[row, rep] != 0):
                    continue
                # Require at least half non-zero intensity values
                if ((2 * numpy.count_nonzero(intensity[row])) < repsPerGroup):
                    continue
                # Adjacent frame (row -/+ 1) intensity values
                prevValue = 0
                nextValue = 0
                if ((row > 0) and (intensity[row - 1, rep] != 0)
                    and (rtDiff[row - 1] < maxRTDiff)):
                    # The frame above has a non-zero intensity and is
                    # within the allowed RT threshold
                    prevValue = intensity[row - 1, rep]
                if ((row < (numRows - 1)) and (intensity[row + 1, rep] != 0)
                    and (rtDiff[row] < maxRTDiff)):
                    # The frame below has a non-zero intensity and is
                    # within the allowed RT threshold
                    nextValue = intensity[row + 1, rep]
                if ((prevValue == 0) and (nextValue == 0)):
                    continue
                # At least one contiguous intensity is greater than
                # zero. Get mean and standard deviation of current frame
                # (non-zero values).
                nonZeroReps = intensity[row][numpy.nonzero(intensity[row])[0]]
                repMean = nonZeroReps.mean()
                # Calculate the maximum standard deviation
                stDev = stDevFactor * nonZeroReps.std()
                minValue = repMean - stDev
                maxValue = repMean + stDev
                # Save the contiguous frame (if any) where to swap the
                # intensity values
                swapIndex = 0
                if ((prevValue != 0) and (prevValue >= minValue)
                    and (prevValue <= maxValue)):
                    prevNonZeroReps = numpy.count_nonzero(intensity[row - 1])
                    if ((2 * prevNonZeroReps) < repsPerGroup):
                        swapIndex = -1
                    elif ((2 * prevNonZeroReps) == repsPerGroup):
                        prevFrameMean = intensity[row - 1][
                                numpy.nonzero(intensity[row - 1])[0]].mean()
                        if (repMean >= prevFrameMean):
                            swapIndex = -1
                if ((nextValue != 0) and (nextValue >= minValue)
                    and (nextValue <= maxValue)):
                    # If the frame above has already been chosen, swap
                    # with the closest intensity value to the mean of
                    # the current frame
                    if ((swapIndex == 0)
                        or (abs(repMean - nextValue)
                            < abs(repMean - prevValue))):
                        nextNonZeroReps = numpy.count_nonzero(
                                intensity[row + 1])
                        if ((2 * nextNonZeroReps) < repsPerGroup):
                            swapIndex = 1
                        elif ((2 * nextNonZeroReps) == repsPerGroup):
                            nextFrameMean = intensity[row + 1][
                                    numpy.nonzero(intensity[row + 1])[0]
                                    ].mean()
                            if (repMean >= nextFrameMean):
                                swapIndex = 1
                if (swapIndex != 0):
                    # Swap with the chosen contiguous frame
                    intensity[row, rep] = intensity[row + swapIndex, rep]
                    intensity[row + swapIndex, rep] = 0
                    modified = True
def cluster_by_mz(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Cluster m/z artifacts that differ from each other by a mass less
    than the defined tolerance.

    The rows are first split into contiguous cluster sections, closed
    where the m/z gap between consecutive rows exceeds the combined m/z
    tolerance of both rows. Hierarchical (complete linkage) clustering
    is then employed within each section to group the ions into the most
    appropriate groups. Mass clusters are assigned an arbitrary unique
    identifier, numbered from 1 in order of appearance, in the new
    "mzClusterID" column (stored as floats).

    NOTE(review): the section split compares consecutive rows, so 'data'
    is expected to be sorted by m/z -- confirm with the caller.

    Keyword Arguments:
        data       -- LFDataFrame instance (modified in place)
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    mzFixedError = parameters['mzFixedError']
    mzPPMError = parameters['mzPPMError']
    # Work by row position so the result does not depend on the index
    # labels of the dataframe
    mzValues = data[mzCol].values
    numRowsData = len(mzValues)
    # Minimum amount of m/z that will belong to the same cluster section
    sectionMinSize = 49
    # Calculate the cluster sections as (first row, row after the last)
    sections = []
    sectionBegin = 0
    while ((numRowsData - sectionBegin) >= sectionMinSize):
        sectionEnd = sectionBegin + sectionMinSize
        while (sectionEnd < (numRowsData - 1)):
            # If the m/z difference to the next frame is greater than
            # the sum of the m/z delta of the largest mass in the
            # current group and the smallest mass in the next group, we
            # can close this cluster section and start a new one
            currentDelta = mz_delta(mzValues[sectionEnd], mzFixedError,
                                    mzPPMError)
            nextDelta = mz_delta(mzValues[sectionEnd + 1], mzFixedError,
                                 mzPPMError)
            mzDiffNextFrame = mzValues[sectionEnd + 1] - mzValues[sectionEnd]
            if (mzDiffNextFrame > (currentDelta + nextDelta)):
                break
            sectionEnd += 1
        sectionEnd += 1
        sections.append((sectionBegin, min(sectionEnd, numRowsData)))
        sectionBegin = sectionEnd
    if (sectionBegin < numRowsData):
        # Group the remaining masses in another cluster section
        sections.append((sectionBegin, numRowsData))
    # Cluster the masses of each section
    clusterLabels = numpy.zeros(numRowsData, dtype=float)
    currentMaxClusterID = 0
    for sectionBegin, sectionEnd in sections:
        sectionMZ = mzValues[sectionBegin : sectionEnd]
        if (len(sectionMZ) == 1):
            # Give the next cluster ID to the item and move to next
            # cluster section
            currentMaxClusterID += 1
            clusterLabels[sectionBegin] = currentMaxClusterID
            continue
        # Perform hierarchical clustering:
        # Get maximum m/z error in current cluster (based on maximum
        # m/z). This will be the cut off for hierarchical clustering.
        currentMaxMZError = 2 * mz_delta(sectionMZ.max(), mzFixedError,
                                         mzPPMError)
        # Calculate distance between every mass in cluster section (one
        # single-item row per mass)
        mzDistMatrix = distance.pdist(sectionMZ.reshape((-1, 1)))
        # Calculate linkage
        mzLinkage = hierarchy.complete(mzDistMatrix)
        # Return a list of flat cluster IDs for each mass, shifting the
        # numbers by the last assigned cluster ID
        mzClusters = hierarchy.fcluster(mzLinkage, currentMaxMZError,
                                        'distance') + currentMaxClusterID
        clusterLabels[sectionBegin : sectionEnd] = mzClusters
        # Increment the current cluster ID by the number of unique
        # clusters in the current mass section
        currentMaxClusterID += len(set(mzClusters))
    # Renumber Cluster IDs based on their appearance in the dataframe.
    # The original labels are kept untouched while comparing, so each
    # row is always checked against the original label of the previous
    # one.
    mzClusterIDs = numpy.empty(numRowsData, dtype=float)
    clusterID = 1
    for index in range(numRowsData):
        if ((index > 0)
            and (clusterLabels[index] != clusterLabels[index - 1])):
            clusterID += 1
        mzClusterIDs[index] = clusterID
    # Add a column to dataframe where the mass cluster IDs are saved
    data['mzClusterID'] = mzClusterIDs
def rm_neutral_loss_frags(
        array,      # type: numpy.ndarray
        losses,     # type: pandas.DataFrame
        parameters  # type: LFParameters
    ):
    # type: (...) -> list[float]
    """Return the row positions in 'array' of the features that are the
    fragmented counterpart of another feature.

    For every feature with an m/z equal or greater than a cut-off in
    'losses', the corresponding neutral loss m/z values are subtracted
    from its m/z. Every feature matching the resulting m/z and eluting
    at the same retention time (RT) as the complete feature is reported.
    All m/z and RT matching are computed within tolerance. The m/z
    lookup relies on a binary search over the first column of 'array'.

    Keyword arguments:
        array      -- array with m/z (first column) and RT (second
                      column) per row
        losses     -- neutral losses to subtract in order to detect
                      fragmented features ("MZCutOff" and "MZ" columns)
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzValues = array[:, 0]
    # One (m/z cut-off, neutral loss m/z) pair per row
    cutOffLossPairs = numpy.stack(
        (losses['MZCutOff'].values, losses['MZ'].values), axis=-1)
    # Group the neutral loss m/z values by their m/z cut-off
    lossesByCutOff = {}
    for cutOff, lossMZ in cutOffLossPairs:
        lossesByCutOff.setdefault(cutOff, []).append(lossMZ)
    matchIndexSet = set()
    for mzCutOff in viewkeys(lossesByCutOff):
        cutOffLosses = lossesByCutOff[mzCutOff]
        # Position of the first m/z value in 'array' at or above the
        # m/z cut-off
        firstIndex = numpy.searchsorted(mzValues, mzCutOff)
        for index in range(firstIndex, len(array)):
            for mzLoss in cutOffLosses:
                # Look for in-source fragments, that is, features whose
                # m/z is the parent's m/z minus the neutral loss and
                # that elute at the same RT
                mzRange = mz_tol_range(array[index, 0] - mzLoss,
                                       parameters['mzFixedError'],
                                       parameters['mzPPMError'])
                # First and last positions of the features within the
                # m/z range
                firstMatch, lastMatch = numpy.searchsorted(mzValues, mzRange)
                if (firstMatch == lastMatch):
                    continue
                # In order to be considered a match, each feature must
                # have the same RT
                minRT, maxRT = rt_tol_range(array[index, 1], RT_TOLERANCE)
                candidateRT = array[firstMatch:lastMatch, 1]
                rtMatches = numpy.where((candidateRT >= minRT)
                                        & (candidateRT <= maxRT))[0]
                if (len(rtMatches) == 0):
                    continue
                # The resultant positions are relative to the start of
                # the m/z search ('firstMatch'). The set handles any
                # repetition.
                matchIndexSet.update(rtMatches + firstMatch)
    return list(matchIndexSet)