def _collect_stack(array, index, rtGap, stackMZ, parameters):
    # type: (numpy.ndarray, int, float, float, LFParameters) -> list
    """Get every feature that matches the stack m/z difference and
    retention time gap from the previous feature to shape the
    contaminant stack.

    Keyword Arguments:
        array      -- array with every feature m/z, retention time (RT)
                      and index
        index      -- index of the previous feature in 'array'
        rtGap      -- RT difference between consecutive features
        stackMZ    -- contaminant m/z difference
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    nextMZ = array[index, 0]
    lastHitRT = array[index, 1]
    rtDiff = 0
    stackList = [index]
    gapCount = 0
    while (gapCount <= parameters['maxStackGap']):
        # Calculate the expected m/z and RT of next stack feature
        nextMZ += stackMZ
        minMZ, maxMZ = mz_tol_range(nextMZ, parameters['mzFixedError'],
                                    parameters['mzPPMError'])
        rtDiff += rtGap
        expectedRT = lastHitRT + rtDiff
        minRT, maxRT = rt_tol_range(expectedRT,
                                    parameters['maxRTDiffAdjFrame'])
        matches = numpy.where(
                (minMZ <= array[:, 0]) & (array[:, 0] <= maxMZ)
                & (minRT <= array[:, 1]) & (array[:, 1] <= maxRT))[0]
        if (len(matches) == 0):
            gapCount += 1
        else:
            # Select the frame with the closest RT to the expected one
            if (len(matches) == 1):
                stackIndex = matches[0]
            else:
                stackIndex = matches[numpy.absolute(
                        array[matches, 1] - expectedRT).argmin()]
            # Add the frame as member of the stack
            stackList.append(stackIndex)
            # Reset the information to calculate the next RT
            lastHitRT = array[stackIndex, 1]
            rtDiff = 0
            # Reset the number of gaps
            gapCount = 0
    return stackList
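# Illustrative usage sketch (not part of the original module): the feature
# array and parameter values below are invented, and 'parameters' is assumed
# to support dict-style access as used throughout this module. Rows 1 and 2
# follow row 0 at a constant m/z offset (44.03) and RT gap (0.5), so the
# whole series should be collected as one contaminant stack.
def _demo_collect_stack():
    features = numpy.array([[200.00, 3.0, 0.0],
                            [244.03, 3.5, 1.0],
                            [288.06, 4.0, 2.0]])
    params = {'maxStackGap': 2, 'mzFixedError': 0.01, 'mzPPMError': 5.0,
              'maxRTDiffAdjFrame': 0.2}
    # Starting from row 0, with an RT gap of 0.5 and a stack m/z of 44.03
    return _collect_stack(features, 0, 0.5, 44.03, params)  # -> [0, 1, 2]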
def rm_full_frags(
        array,      # type: numpy.ndarray
        fragments,  # type: pandas.DataFrame
        parameters  # type: LFParameters
):
    # type: (...) -> list[int]
    """Return an index list corresponding to common in-source fragments
    in the given sample array.

    Return the index list of all 'array' features that match the m/z
    values provided in 'fragments' and for which there is at least one
    other feature above the given m/z cut-off at the same retention
    time (RT). All m/z and RT matching is computed within tolerance.

    Keyword arguments:
        array      -- array with m/z, RT and index of the original
                      dataframe
        fragments  -- in-source fragments to be removed
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    # Create an array with one in-source fragment m/z and its m/z
    # cut-off per row
    fragsArray = numpy.stack(
            (fragments['MZ'].values, fragments['MZCutOff'].values), axis=-1)
    fragsIndex = []
    for fragMZ, mzCutOff in fragsArray:
        mzRange = mz_tol_range(fragMZ, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        # Get the index of 'array' features that match the in-source
        # fragment m/z value ('array' is expected to be sorted by m/z)
        mzMatches = numpy.searchsorted(array[:, 0], mzRange)
        if (mzMatches[0] == mzMatches[1]):
            continue
        for index in range(mzMatches[0], mzMatches[1]):
            # To be a match, each feature must have the same RT
            minRT, maxRT = rt_tol_range(array[index, 1], RT_TOLERANCE)
            rtMatches = numpy.where((array[:, 0] >= mzCutOff)
                                    & (array[:, 1] >= minRT)
                                    & (array[:, 1] <= maxRT))[0]
            if (len(rtMatches) > 0):
                # Mark the feature as an in-source fragment
                fragsIndex.append(index)
    return fragsIndex
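# Illustrative usage sketch (hypothetical values): a feature at m/z 184.07
# matches the fragment list, and a co-eluting feature above the 400.0 m/z
# cut-off exists, so index 0 is flagged as an in-source fragment. The input
# array must be sorted by m/z for numpy.searchsorted to work, and
# RT_TOLERANCE is the module constant used above.
def _demo_rm_full_frags():
    features = numpy.array([[184.07, 5.2, 0.0],    # candidate fragment
                            [520.34, 5.2, 1.0]])   # intact feature, same RT
    fragments = pandas.DataFrame({'MZ': [184.07], 'MZCutOff': [400.0]})
    params = {'mzFixedError': 0.01, 'mzPPMError': 5.0}
    return rm_full_frags(features, fragments, params)  # -> [0]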
def amalgamate_data(negData, posData, parameters, dst=''):
    # type: (object, object, LFParameters, str) -> None
    """Amalgamate negative and positive ion polarity dataframes.

    'negData' and 'posData' must follow the same column layout as the
    output files from LipidFinder's PeakFilter module. For those frames
    with matching m/z and retention time, the one with the lowest total
    intensity mean is discarded. Both files must have the same column
    headings.

    If 'dst' is not an absolute path, the current working directory
    will be used as starting point. If the "amalgamated.csv" file
    already exists, it will be overwritten.

    Keyword Arguments:
        negData    -- negative polarity LFDataFrame or pandas.DataFrame
                      instance
        posData    -- positive polarity LFDataFrame or pandas.DataFrame
                      instance
        parameters -- LipidFinder's Amalgamator parameters instance
        dst        -- destination directory where the log file and the
                      amalgamated data CSV file will be saved
                      [default: current working directory]
    """
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'amalgamator.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Write initial information in the log file
    logger.info(("Starting Amalgamator. Negative dataframe has %d rows and "
                 "Positive dataframe has %d rows."), len(negData.index),
                len(posData.index))
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    # Check if columns in both dataframes are the same
    if (set(negData.columns) != set(posData.columns)):
        diffCols = set(negData.columns).symmetric_difference(posData.columns)
        raise IOError(("Input dataframes do not share the same column names: "
                       "{0}").format(', '.join(diffCols)))
    # Check for misspelling errors in m/z or retention time column names
    if ((mzCol not in negData.columns) or (rtCol not in negData.columns)):
        raise KeyError("Missing '{0}' or '{1}' column(s)".format(mzCol, rtCol))
    # Get the indices for intensity columns
    firstIndex = parameters['firstSampleIndex'] - 1
    lastIndex = firstIndex + parameters['numSamples']
    # Calculate the mean of every non-zero value of the intensity
    # columns of each input dataframe and round it to the nearest
    # integer. Replace any NaN output from mean() by zero.
    totalMean = lambda x: numpy.rint(
            numpy.nan_to_num(x[numpy.where(x > 0)[0]].mean())).astype(int)
    negData['TotalMean'] = negData.iloc[:, firstIndex:lastIndex].apply(
            totalMean, axis=1)
    posData['TotalMean'] = posData.iloc[:, firstIndex:lastIndex].apply(
            totalMean, axis=1)
    nind = negData.index.values
    nmz = negData[mzCol].values
    nrt = negData[rtCol].values
    nmeans = negData['TotalMean'].values
    negCol = list(negData.columns.values)
    posCol = list(posData.columns.values)
    # Empty results dataframe
    results = pandas.DataFrame(columns=negCol)
    polColIndex = results.columns.get_loc('Polarity')
    # Start progress bar
    progress = 0
    total = len(nind) + 1
    print_progress_bar(progress, total, prefix='Amalgamator progress:')
    # Loop through indices in the negative file
    for i in nind:
        # Update progress bar
        progress += 1
        print_progress_bar(progress, total, prefix='Amalgamator progress:')
        negMass = nmz[i]
        negRT = nrt[i]
        pmz = posData[mzCol].values
        prt = posData[rtCol].values
        pmeans = posData['TotalMean'].values
        negMassH2 = negMass + HYDROGEN
        mzRange = mz_tol_range(negMassH2, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        rtRange = rt_tol_range(negRT, parameters['maxRTDiffAdjFrame'])
        matchesH2 = list(
                numpy.where((pmz >= mzRange[0]) & (pmz <= mzRange[1])
                            & (prt >= rtRange[0]) & (prt <= rtRange[1]))[0])
        # First, look for H2 matches
        if (matchesH2):
            indMatch = __bestMatch__(matchesH2, negMassH2, pmz, negRT, prt,
                                     parameters)
            # Keep the frame with the highest total mean
            if (pmeans[indMatch] > nmeans[i]):
                results = results.append(posData.iloc[indMatch],
                                         ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + negData.iloc[i, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            else:
                results = results.append(negData.iloc[i], ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + posData.iloc[indMatch, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            logger.info('Match found: Negative ID %d - Positive ID %d.',
                        negData.iloc[i, 0], posData.iloc[indMatch, 0])
            # Remove the match from the positive dataframe, avoiding
            # writing the action to the log file
            if (isinstance(posData, LFDataFrame)):
                super(LFDataFrame, posData).drop(indMatch, inplace=True)
            else:
                posData.drop(indMatch, inplace=True)
            posData.reset_index(inplace=True, drop=True)
            pmz = posData[mzCol].values
            prt = posData[rtCol].values
            pmeans = posData['TotalMean'].values
            continue
        # If there are no H2 matches, look for CH4 matches
        negMassCH4 = negMass + METHANE
        mzRange = mz_tol_range(negMassCH4, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        matchesHCH3 = list(
                numpy.where((pmz >= mzRange[0]) & (pmz <= mzRange[1])
                            & (prt >= rtRange[0]) & (prt <= rtRange[1]))[0])
        if (matchesHCH3):
            indMatch = __bestMatch__(matchesHCH3, negMassCH4, pmz, negRT, prt,
                                     parameters)
            # Keep the frame with the highest total mean
            if (pmeans[indMatch] > nmeans[i]):
                results = results.append(posData.iloc[indMatch],
                                         ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + negData.iloc[i, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            else:
                results = results.append(negData.iloc[i], ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + posData.iloc[indMatch, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            logger.info('Match found: Negative ID %d - Positive ID %d.',
                        negData.iloc[i, 0], posData.iloc[indMatch, 0])
            # Remove the match from the positive dataframe, avoiding
            # writing the action to the log file
            if (isinstance(posData, LFDataFrame)):
                super(LFDataFrame, posData).drop(indMatch, inplace=True)
            else:
                posData.drop(indMatch, inplace=True)
            posData.reset_index(inplace=True, drop=True)
            pmz = posData[mzCol].values
            prt = posData[rtCol].values
            pmeans = posData['TotalMean'].values
            continue
        results = results.append(negData.iloc[i], ignore_index=True)
    # Append what remains in the positive dataframe (unmatched positive
    # m/z values)
    results = results.append(posData, ignore_index=True)
    if (pandas.__version__ < '0.23.0'):
        # Fix unexpected column sorting from append() in pandas v0.20.3
        # or newer (solved in v0.23.0 with the argument "sort=False")
        results = results.reindex(negCol, axis=1)
    results.drop('TotalMean', axis=1, inplace=True)
    # Sort results by m/z and retention time and create the CSV file
    results.sort_values([mzCol, rtCol], inplace=True, kind='mergesort')
    results.to_csv(os.path.join(dst, 'amalgamated.csv'), index=False)
    # Update progress bar
    print_progress_bar(total, total, prefix='Amalgamator progress:')
    # Write the final information in the log file and remove the handler
    logger.info('Amalgamator completed. Output dataframe has %d rows.\n',
                len(results.index))
    handler.close()
    logger.removeHandler(handler)
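# Illustrative usage sketch (hypothetical column names and values): two
# single-sample dataframes whose m/z values differ by the [M-H]-/[M+H]+
# offset, assuming the module's HYDROGEN constant is that two-hydrogen
# offset (~2.015 Da). The parameters dict mimics the Amalgamator keys
# accessed above, and the sketch assumes the legacy pandas API targeted by
# this module (DataFrame.append). It writes 'amalgamator.log' and
# 'amalgamated.csv' in the current working directory.
def _demo_amalgamate_data():
    negData = pandas.DataFrame({'ID': [1], 'MZ': [255.2330], 'RT': [10.5],
                                'Sample1': [5000], 'Polarity': ['Negative']})
    posData = pandas.DataFrame({'ID': [1], 'MZ': [257.2486], 'RT': [10.5],
                                'Sample1': [8000], 'Polarity': ['Positive']})
    params = {'mzCol': 'MZ', 'rtCol': 'RT', 'firstSampleIndex': 4,
              'numSamples': 1, 'mzFixedError': 0.01, 'mzPPMError': 5.0,
              'maxRTDiffAdjFrame': 0.2, 'combineIntensities': False}
    # The positive frame has the higher total mean, so the output keeps it
    # with its 'Polarity' tagged as 'Positive (Both)'
    amalgamate_data(negData, posData, params)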
def _detect_sample_isotopes(array, parameters):
    # type: (numpy.ndarray, LFParameters) -> numpy.ndarray
    """Return an array with the tagged parents and their corresponding
    isotopes in the same order as in the given sample array.

    Keyword Arguments:
        array      -- array with m/z, retention time (RT), sample's
                      intensity mean and index of the original
                      dataframe
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    # Get the corresponding symbol for the polarity of the data (+ or -)
    polSign = '+' if (parameters['polarity'].lower() == 'positive') else '-'
    # Create an array of empty strings that will contain the tagged
    # parents and their corresponding isotopes
    tagArray = numpy.full(len(array), '', dtype=object)
    # Loop over each m/z to search for isotopes
    isotopesIndex = set()
    for index in range(0, len(array)):
        # Skip if the frame has already been identified as an isotope
        if (array[index, 3] in isotopesIndex):
            continue
        for isoPeak in range(1, parameters['numIsotopes'] + 1):
            parentMZ = array[index, 0]
            tagID = int(array[index, 3])
            # Get the first and last indexes of the frames that are
            # within the current isotope's m/z range for the analyte
            isotopeMZ = parentMZ + ISO_OFFSET * isoPeak
            minMZ, maxMZ = mz_tol_range(isotopeMZ, parameters['mzFixedError'],
                                        parameters['mzPPMError'])
            mzMatches = numpy.searchsorted(array[:, 0], [minMZ, maxMZ])
            if (mzMatches[0] == mzMatches[1]):
                # Have not found any analyte with an isotope-like m/z
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                else:
                    continue
            # Filter m/z matches with the same RT as the parent
            parentRT = array[index, 1]
            minRT, maxRT = rt_tol_range(parentRT,
                                        parameters['maxRTDiffAdjFrame'])
            rtMatches = numpy.where(
                    (array[mzMatches[0] : mzMatches[1], 1] >= minRT)
                    & (array[mzMatches[0] : mzMatches[1], 1] <= maxRT))[0]
            if (len(rtMatches) == 0):
                # No candidates are within the same RT
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                else:
                    continue
            # Resultant indexes are based on the previous search
            rtMatches += mzMatches[0]
            # Filter the candidate isotopes by intensity
            parentInten = array[index, 2]
            # The intensity range coefficients vary depending on the
            # isotope number
            if (isoPeak == 1):
                # Get an estimated maximum number of C in the molecule
                numC = round(parentMZ / 12)
                # Calculate the isotopic distribution based on a
                # polynomial expansion
                baseIntensity = parentInten * (numC ** 1.3) * 0.002
                minIntensity = baseIntensity * parameters['isoIntensityCoef'][0]
                maxIntensity = baseIntensity * parameters['isoIntensityCoef'][1]
            elif (isoPeak == 2):
                # Get an estimated maximum number of C in the molecule
                numC = round(parentMZ / 12)
                # Calculate the isotopic distribution based on a
                # polynomial expansion
                baseIntensity = parentInten * (numC ** 1.7) * 0.0001
                minIntensity = baseIntensity * parameters['isoIntensityCoef'][0]
                maxIntensity = baseIntensity * parameters['isoIntensityCoef'][1]
            else:
                # Calculate the isotopic distribution with the same
                # formula as CAMERA (from XCMS)
                minIntensity = parentInten * float('1e-{0}'.format(isoPeak + 2))
                maxIntensity = parentInten * 2
            isotopes = numpy.where((array[rtMatches, 2] >= minIntensity)
                                   & (array[rtMatches, 2] <= maxIntensity))[0]
            if (len(isotopes) == 0):
                # No candidates have an intensity within expected range
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                else:
                    continue
            # Map the positions within 'rtMatches' back to indexes of
            # 'array' ('rtMatches' might not be contiguous, so indexing
            # is safer than adding an offset)
            isotopes = rtMatches[isotopes]
            # Tag the analyte as isotope and save its index to avoid
            # checking it as parent of other analytes
            tagArray[isotopes] = '[{0}][M+{1}]{2}'.format(tagID, isoPeak,
                                                          polSign)
            isotopesIndex.update(array[isotopes, 3])
        else:
            # The search completed without breaking (the first isotope
            # was found): tag the analyte as parent
            tagArray[index] = '[{0}][M]{1}'.format(tagID, polSign)
    return tagArray
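# Illustrative usage sketch (hypothetical values): a parent at m/z 500 with
# a plausible 13C isotope one ISO_OFFSET above it (assumed to be ~1.0034 Da,
# the 13C-12C mass difference), eluting at the same RT. The column layout
# follows the docstring: m/z, RT, intensity mean, original dataframe index.
def _demo_detect_sample_isotopes():
    features = numpy.array([[500.0000, 6.0, 1.0e6, 0.0],
                            [501.0034, 6.0, 2.5e5, 1.0]])
    params = {'polarity': 'Positive', 'numIsotopes': 2,
              'mzFixedError': 0.005, 'mzPPMError': 5.0,
              'maxRTDiffAdjFrame': 0.2, 'isoIntensityCoef': [0.5, 2.0]}
    return _detect_sample_isotopes(features, params)
    # -> ['[0][M]+', '[0][M+1]+'] if row 1 falls in the intensity window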
def remove_stacks(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Detect lipid and contaminant stacks and delete all ions present
    (in lipid stacks the parent is retained).

    A stack is a series of ions differing in m/z by a user-defined
    fixed mass shift. Lipid stacks elute at the same retention time
    (RT), whilst contaminant stacks increase their RT as the overall
    m/z increases. Firstly, the m/z is checked for a lipid stack: if a
    stack is present, all ions except the parent are deleted and the
    next m/z is checked. If no lipid stack is found, the m/z is checked
    for contaminant stacks. If one is found, the whole stack, including
    the parent, is removed. The list of lipid and contaminant stack
    mass differences is imported from the stacks CSV file.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    firstSample = parameters['firstSampleIndex'] - 1
    lastSample = firstSample \
            + (parameters['numSamples'] * parameters['numTechReps'])
    # Read the CSV file with the stacks information
    stacks = pandas.read_csv(parameters['stacksCSVPath'])
    # Separate lipid and contaminant stacks
    lipidStacksMZ = stacks.loc[stacks['Category'] == 'Lipid', 'MZ'].values
    contStacksMZ = stacks.loc[stacks['Category'] == 'Contaminant',
                              'MZ'].values
    # Build an array with m/z, RT and index to track the frames removed
    # as part of a stack
    array = numpy.stack((data[mzCol].values, data[rtCol].values,
                         data.index.values), axis=-1)
    # Start the loop to find every stack in the dataset
    parentIndex = 0
    toRemove = {'lipid': [], 'contam': []}
    while (parentIndex < (len(array) - 1)):
        parentMZ, parentRT = array[parentIndex, 0:2]
        # Lipid stack removal where m/z and RT have to be an exact
        # match (within tolerance)
        minRT, maxRT = rt_tol_range(parentRT,
                                    parameters['maxRTDiffAdjFrame'])
        # Stacks can be of only one type: if a lipid stack is found,
        # the parent m/z will not be analysed as part of a contaminant
        # stack
        for stackMZ in lipidStacksMZ:
            stackDiff = 0
            gapCount = 0
            stackList = []
            while (gapCount <= parameters['maxStackGap']):
                # Calculate the expected m/z of the next feature
                stackDiff += stackMZ
                minMZ, maxMZ = mz_tol_range(parentMZ + stackDiff,
                                            parameters['mzFixedError'],
                                            parameters['mzPPMError'])
                matches = numpy.where(
                        (minMZ <= array[:, 0]) & (array[:, 0] <= maxMZ)
                        & (minRT <= array[:, 1]) & (array[:, 1] <= maxRT))[0]
                if (len(matches) == 0):
                    gapCount += 1
                else:
                    # Select the feature with the closest RT to parent
                    if (len(matches) == 1):
                        stackIndex = matches[0]
                    else:
                        stackIndex = matches[numpy.absolute(
                                array[matches, 1] - parentRT).argmin()]
                    # Add the frame as member of the stack
                    stackList.append(stackIndex)
            if (len(stackList) >= MIN_LIPID_STACK):
                # Mark frames to be removed as part of the lipid stack
                indexes = array[stackList, 2].astype(int).tolist()
                toRemove['lipid'].extend(indexes)
                # Keep the parent's original dataframe index: positions
                # in 'array' and 'data' diverge once frames have been
                # removed from 'array'
                parentID = int(array[parentIndex, 2])
                # Remove frames in stack from array
                array = numpy.delete(array, stackList, axis=0)
                if (parameters['lipidStackAddition']):
                    # Add stack intensities to the parent frame
                    data.iloc[parentID, firstSample : lastSample] += \
                            data.iloc[indexes,
                                      firstSample : lastSample].sum(axis=0)
                break
        else:
            # No lipid stacks were found: search for contaminant stacks
            # where m/z has to be an exact match (within tolerance) and
            # the RT between every two consecutive features is the same
            # (in accordance with the m/z difference)
            for stackMZ in contStacksMZ:
                stackDiff = 0
                stackList = []
                # The RT gap is unknown until the stack has two
                # features, so take the upper bound of the parent's RT
                # tolerance as the minimum RT of the next feature
                minRT = rt_tol_range(parentRT,
                                     parameters['maxRTDiffAdjFrame'])[1]
                # Get every possible stack within the maximum gap
                # distance and keep the largest one
                for gapCount in range(0, parameters['maxStackGap']):
                    # Calculate the expected m/z of next stack feature
                    stackDiff += stackMZ
                    minMZ, maxMZ = mz_tol_range(parentMZ + stackDiff,
                                                parameters['mzFixedError'],
                                                parameters['mzPPMError'])
                    matches = numpy.where(
                            (minMZ <= array[:, 0]) & (array[:, 0] <= maxMZ)
                            & (minRT < array[:, 1]))[0]
                    # Explore every possible stack (different RT gap)
                    # and keep the largest one
                    for i in range(0, len(matches)):
                        nextMZ, nextRT = array[matches[i], 0:2]
                        rtGap = (nextRT - parentRT) / (gapCount + 1)
                        matchStackList = _collect_stack(
                                array, matches[i], rtGap, stackMZ,
                                parameters)
                        if (len(matchStackList) > len(stackList)):
                            stackList = matchStackList
                if ((len(stackList) + 1) >= MIN_CONTAM_STACK):
                    # Mark frames to be removed as part of the
                    # contaminant stack (parent included)
                    stackList.append(parentIndex)
                    indexes = array[stackList, 2].astype(int).tolist()
                    toRemove['contam'].extend(indexes)
                    # Remove frames in stack from array
                    array = numpy.delete(array, stackList, axis=0)
                    # Adjust index after the parent has been removed
                    parentIndex -= 1
                    break
        parentIndex += 1
    # Remove lipid and/or contaminant stack features
    if (toRemove['lipid'] or toRemove['contam']):
        data.drop('Stacks removal (lipid)', labels=toRemove['lipid'],
                  inplace=True)
        data.drop('Stacks removal (contaminant)', labels=toRemove['contam'],
                  inplace=True)
        # Reset the index of the dataframe after the update
        data.reset_index(inplace=True, drop=True)
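# Illustrative sketch (hypothetical file content): remove_stacks reads the
# stacks CSV through parameters['stacksCSVPath'] and, judging by the loc[]
# filters above, expects at least a 'Category' column ('Lipid' or
# 'Contaminant') and an 'MZ' column with the stack mass difference. The
# values below are examples only; 44.0262 (the C2H4O repeat unit of PEG) is
# a classic contaminant stack offset.
def _demo_write_stacks_csv(path='stacks_example.csv'):
    stacks = pandas.DataFrame({'Category': ['Contaminant', 'Lipid'],
                               'MZ': [44.0262, 14.0157]})
    stacks.to_csv(path, index=False)
    return path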
def __rep_adduct_removal__(replicate,    # type: pandas.Series
                           adductsPairs, # type: list
                           adducts,      # type: pandas.DataFrame
                           mzArray,      # type: numpy.ndarray
                           rtArray,      # type: numpy.ndarray
                           parameters    # type: LFParameters
                           ):
    # type: (...) -> pandas.Series
    """Detect pairs of adducts in the given sample replicate and set to
    zero the lowest intensity of each pair.

    The m/z and retention time (RT) matches are done within a
    tolerance.

    Keyword Arguments:
        replicate    -- replicate's intensities
        adductsPairs -- list of paired adducts
        adducts      -- adducts information
        mzArray      -- sample replicate's m/z values
        rtArray      -- sample replicate's RT values
        parameters   -- LipidFinder's PeakFilter parameters instance
    """
    # Get the index of all intensities that are not zero
    nonZeroIndices = replicate.values.nonzero()[0]
    # Create an array of intensities that are not zero, and the arrays
    # with their corresponding m/z and RT values
    nzIntensities = numpy.copy(replicate.values[nonZeroIndices])
    nzMZ = numpy.copy(mzArray[nonZeroIndices])
    nzRT = numpy.copy(rtArray[nonZeroIndices])
    # Create an array to hold a reference to the first adduct found
    # (default: ""). An object dtype is used so that the full tag
    # strings are kept (a plain str dtype would truncate them)
    adductTags = numpy.full(nonZeroIndices.size, '', dtype=object)
    for pair in adductsPairs:
        # Get the adducts information of the pair to create the lambda
        # function to calculate the offset of the given mass
        pairInfo = adducts.loc[adducts.iloc[:, 0].isin(pair)]
        # abs() function makes the pairs order insensitive
        get_offset = lambda x: \
                abs(x - (pairInfo.iloc[1, 1] * (x - pairInfo.iloc[0, 2])
                         / pairInfo.iloc[0, 1] + pairInfo.iloc[1, 2]))
        index = 0
        while (index < (nonZeroIndices.size - 1)):
            # Get the first adduct tag of the source index
            tag = adductTags[index]
            # If the source frame has been previously tagged and the
            # tag is not equal to the lower mass species, this frame
            # cannot be considered further, so move on to the next one
            if (tag and (tag != pair[0])):
                index += 1
                continue
            # Get the m/z and RT of the current frame
            mz = nzMZ[index]
            rt = nzRT[index]
            # The adduct mass for the source mass with the current pair
            adductMZ = mz + get_offset(mz)
            # The limits of a mass that could be an adduct of the
            # source mass, including the error tolerance
            minAdductMZ, maxAdductMZ = mz_tol_range(
                    adductMZ, parameters['mzFixedError'],
                    parameters['mzPPMError'])
            # Get the tolerance range for the RT
            minRT, maxRT = rt_tol_range(rt, parameters['maxRTDiffAdjFrame'])
            # Find the potential adducts by m/z and RT
            potentialAdducts = numpy.where(
                    (nzMZ >= minAdductMZ) & (nzMZ <= maxAdductMZ)
                    & (nzRT >= minRT) & (nzRT <= maxRT))[0]
            if (potentialAdducts.size > 0):
                # Get the index of the adduct with the closest RT to
                # the subject RT
                adductIndex = potentialAdducts[numpy.absolute(
                        nzRT[potentialAdducts] - rt).argmin()]
                # If the intensity of the adduct is greater than the
                # intensity of the source, set the latter to 0.
                # Otherwise, set the former to 0.
                if (nzIntensities[adductIndex] > nzIntensities[index]):
                    # Go ahead if the adduct has not been tagged yet
                    if (not adductTags[adductIndex]):
                        # Record the adduct species
                        adductTags[adductIndex] = pairInfo.iloc[1, 0]
                        if (parameters['adductAddition']):
                            replicate.values[nonZeroIndices[adductIndex]] += \
                                    nzIntensities[index]
                        # Set the source intensity to 0
                        replicate.values[nonZeroIndices[index]] = 0
                        # Remove the source from each array
                        nonZeroIndices = numpy.delete(nonZeroIndices, index)
                        nzIntensities = numpy.delete(nzIntensities, index)
                        nzMZ = numpy.delete(nzMZ, index)
                        nzRT = numpy.delete(nzRT, index)
                        adductTags = numpy.delete(adductTags, index)
                        # 'index' will now point to the next frame
                        continue
                else:
                    if (not tag):
                        # Record the adduct species
                        adductTags[index] = pairInfo.iloc[0, 0]
                    if (parameters['adductAddition']):
                        replicate.values[nonZeroIndices[index]] += \
                                nzIntensities[adductIndex]
                    # Set the adduct intensity to 0
                    replicate.values[nonZeroIndices[adductIndex]] = 0
                    # Remove the adduct from each array
                    nonZeroIndices = numpy.delete(nonZeroIndices, adductIndex)
                    nzIntensities = numpy.delete(nzIntensities, adductIndex)
                    nzMZ = numpy.delete(nzMZ, adductIndex)
                    nzRT = numpy.delete(nzRT, adductIndex)
                    adductTags = numpy.delete(adductTags, adductIndex)
            index += 1
    return replicate
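# Illustrative usage sketch (hypothetical adducts table): judging by the
# iloc[] calls in get_offset above, each row of 'adducts' is assumed to
# hold the species name, a mass multiplier and an additive m/z offset, in
# that column order, with the lower-mass species listed first in each
# pair. Here the pair is M+H/M+Na (~21.9819 Da apart); the weaker M+Na
# signal is folded into M+H and then zeroed.
def _demo_rep_adduct_removal():
    adducts = pandas.DataFrame({'Name': ['M+H', 'M+Na'],
                                'Multiplier': [1.0, 1.0],
                                'Offset': [1.00728, 22.98922]})
    mzArray = numpy.array([500.27560, 522.25754])  # same M, two adducts
    rtArray = numpy.array([7.4, 7.4])
    replicate = pandas.Series([90000.0, 30000.0])
    params = {'mzFixedError': 0.005, 'mzPPMError': 5.0,
              'maxRTDiffAdjFrame': 0.2, 'adductAddition': True}
    return __rep_adduct_removal__(replicate, [('M+H', 'M+Na')], adducts,
                                  mzArray, rtArray, params)
    # -> intensities [120000.0, 0.0]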
def rm_neutral_loss_frags(
        array,      # type: numpy.ndarray
        losses,     # type: pandas.DataFrame
        parameters  # type: LFParameters
):
    # type: (...) -> list[int]
    """Return an index list corresponding to the features in the given
    sample array that are in-source fragments created by a neutral
    loss.

    Return the index list of all 'array' features that have lost one of
    the m/z in 'losses' and whose complete counterpart is present in
    the data. Only features with an m/z above the given cut-off are
    considered as parents. All m/z and retention time (RT) matching is
    computed within tolerance.

    Keyword arguments:
        array      -- array with m/z, RT and index of the original
                      dataframe
        losses     -- neutral losses to subtract in order to detect
                      fragmented features
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    # Create an array with one m/z cut-off and neutral loss m/z per row
    fragsArray = numpy.stack((losses['MZCutOff'].values, losses['MZ'].values),
                             axis=-1)
    # Create a dictionary with cut-off values as keys and their
    # corresponding neutral loss m/z in lists as values
    fragsDict = {}
    for mzCutOff, mzLoss in fragsArray:
        fragsDict.setdefault(mzCutOff, []).append(mzLoss)
    matchIndexSet = set()
    for mzCutOff in viewkeys(fragsDict):
        # Get the index of the first m/z value in 'array' greater than
        # the m/z cut-off
        firstIndex = numpy.searchsorted(array[:, 0], mzCutOff)
        for index in range(firstIndex, len(array)):
            for mzLoss in fragsDict[mzCutOff]:
                # Look for in-source fragments, that is, features that
                # are the result of subtracting the neutral loss from
                # the parent's m/z and that elute at the same RT
                fragMZ = array[index, 0] - mzLoss
                mzRange = mz_tol_range(fragMZ, parameters['mzFixedError'],
                                       parameters['mzPPMError'])
                # Get the first and last indexes of the features within
                # the m/z range
                mzMatches = numpy.searchsorted(array[:, 0], mzRange)
                if (mzMatches[0] == mzMatches[1]):
                    continue
                # In order to be considered a match, each feature must
                # have the same RT
                minRT, maxRT = rt_tol_range(array[index, 1], RT_TOLERANCE)
                rtMatches = numpy.where(
                        (array[mzMatches[0] : mzMatches[1], 1] >= minRT)
                        & (array[mzMatches[0] : mzMatches[1], 1] <= maxRT))[0]
                if (len(rtMatches) == 0):
                    continue
                # The resultant indexes are based on the starting index
                # of the search ('mzMatches[0]')
                rtMatches += mzMatches[0]
                # The union of sets will handle any index repetition
                matchIndexSet.update(set(rtMatches))
    return list(matchIndexSet)
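# Illustrative usage sketch (hypothetical values): a parent feature at
# m/z 760.5851 with a co-eluting feature one water loss (18.0106 Da) below
# it. The returned index points at the fragment, not the parent. 'array'
# must be sorted by m/z, and RT_TOLERANCE is the module constant used above.
def _demo_rm_neutral_loss_frags():
    features = numpy.array([[742.5745, 9.1, 0.0],   # water-loss fragment
                            [760.5851, 9.1, 1.0]])  # intact parent
    losses = pandas.DataFrame({'MZCutOff': [400.0], 'MZ': [18.0106]})
    params = {'mzFixedError': 0.005, 'mzPPMError': 5.0}
    return rm_neutral_loss_frags(features, losses, params)  # -> [0]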