def remove_contaminants(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Remove straight m/z contaminants included in the contaminants CSV
    file from input data.

    The contaminants CSV file is selected by the 'polarity' parameter
    and must contain an 'MZ' column. Every frame of 'data' whose m/z is
    within the tolerance range of a contaminant m/z is dropped in place,
    and the index of 'data' is reset afterwards.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    # Read the CSV file with the contaminants information
    if (parameters['polarity'] == 'Negative'):
        csvPath = parameters['negContaminantsCSVPath']
    else:
        csvPath = parameters['posContaminantsCSVPath']
    contaminants = pandas.read_csv(csvPath)
    # Collect the index label of every frame that matches a known
    # contaminant. A set is used so a frame matching more than one
    # contaminant is only scheduled for removal once.
    toRemove = set()
    # Iterate over the raw values: the row index of the contaminant is
    # not needed and Series.iteritems() was removed in pandas 2.0
    for mz in contaminants['MZ'].values:
        minMZ, maxMZ = mz_tol_range(mz, parameters['mzFixedError'],
                                    parameters['mzPPMError'])
        toRemove.update(data[(minMZ <= data[mzCol])
                             & (data[mzCol] <= maxMZ)].index.tolist())
    if (toRemove):
        data.drop('Contaminants removal', labels=list(toRemove), inplace=True)
        # Reset the index of dataframe after the update
        data.reset_index(inplace=True, drop=True)
def _collect_stack(array, index, rtGap, stackMZ, parameters):
    # type: (numpy.ndarray, int, float, float, LFParameters) -> list
    """Build a contaminant stack starting from the given feature.

    Starting at 'index', repeatedly look for the feature whose m/z is
    one 'stackMZ' step further and whose retention time (RT) is one
    'rtGap' step later (both within tolerance). Steps without a match
    are counted as gaps; the search stops once the number of
    consecutive gaps exceeds the 'maxStackGap' parameter.

    Keyword Arguments:
        array      -- array with every feature m/z, retention time (RT)
                      and index
        index      -- index of the previous feature in 'array'
        rtGap      -- RT difference between consecutive features
        stackMZ    -- contaminant m/z difference
        parameters -- LipidFinder's PeakFilter parameters instance

    Return the list of 'array' row indexes that shape the stack, with
    'index' as its first element.
    """
    mzValues = array[:, 0]
    rtValues = array[:, 1]
    targetMZ = mzValues[index]
    anchorRT = rtValues[index]
    members = [index]
    # RT accumulated since the last stack member was found
    elapsedRT = 0
    misses = 0
    while (misses <= parameters['maxStackGap']):
        # Move one step forward in both m/z and RT
        targetMZ += stackMZ
        lowMZ, highMZ = mz_tol_range(targetMZ,
                                     parameters['mzFixedError'],
                                     parameters['mzPPMError'])
        elapsedRT += rtGap
        predictedRT = anchorRT + elapsedRT
        lowRT, highRT = rt_tol_range(predictedRT,
                                     parameters['maxRTDiffAdjFrame'])
        candidates = numpy.where(
                (mzValues >= lowMZ) & (mzValues <= highMZ)
                & (rtValues >= lowRT) & (rtValues <= highRT))[0]
        if (len(candidates) == 0):
            # Nothing at this step: count the gap and try the next one
            misses += 1
            continue
        # Keep the candidate whose RT is the closest to the predicted
        # one (with a single candidate this selects that candidate)
        hit = candidates[numpy.absolute(
                rtValues[candidates] - predictedRT).argmin()]
        members.append(hit)
        # The next RT prediction starts from the feature just found
        anchorRT = rtValues[hit]
        elapsedRT = 0
        misses = 0
    return members
Esempio n. 3
0
def rm_full_frags(
        array,  # type: numpy.ndarray
        fragments,  # type: pandas.DataFrame
        parameters  # type: LFParameters
):
    # type: (...) -> list[float]
    """Return the 'array' row indexes flagged as common in-source
    fragments.

    A feature is flagged when its m/z matches one of the m/z values in
    'fragments' and 'array' contains at least one feature with an m/z
    equal to or above the corresponding cut-off at the same retention
    time (RT). All m/z and RT matching are computed within tolerance.
    The m/z lookup uses a binary search, so it relies on the first
    column of 'array' being sorted in ascending order.

    Keyword arguments:
        array      -- array with m/z, RT and index of the original
                      dataframe
        fragments  -- in-source fragments to be removed
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzValues = array[:, 0]
    rtValues = array[:, 1]
    # One row per in-source fragment: fragment m/z and its m/z cut-off
    fragsTable = numpy.stack(
        (fragments['MZ'].values, fragments['MZCutOff'].values), axis=-1)
    flagged = []
    for fragMZ, mzCutOff in fragsTable:
        mzRange = mz_tol_range(fragMZ, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        # Rows of 'array' whose m/z matches the in-source fragment m/z
        # (the range is empty when there are no matches)
        start, stop = numpy.searchsorted(mzValues, mzRange)
        aboveCutOff = (mzValues >= mzCutOff)
        for candidate in range(start, stop):
            # The candidate is a fragment if any feature above the
            # cut-off elutes at the same RT
            lowRT, highRT = rt_tol_range(rtValues[candidate], RT_TOLERANCE)
            coeluting = (aboveCutOff & (rtValues >= lowRT)
                         & (rtValues <= highRT))
            if (coeluting.any()):
                flagged.append(candidate)
    return flagged
Esempio n. 4
0
def _positive_mean(intensities):
    """Return the mean of the values greater than zero in 'intensities'
    rounded to the nearest integer (zero if there are none)."""
    # Boolean selection works on any index type; mean() of an empty
    # selection is NaN, which nan_to_num() turns into zero
    return numpy.rint(
        numpy.nan_to_num(intensities[intensities > 0].mean())).astype(int)


def _amalgamate_frames(negData, posData, parameters, dst, logger):
    """Merge both polarity dataframes and write "amalgamated.csv".

    This is the body of amalgamate_data() without the log file handler
    set-up, so the caller can always release the handler.
    """
    # Write initial information in log file
    logger.info(("Starting Amalgamator. Negative dataframe has %d rows and "
                 "Positive dataframe has %d rows."), len(negData.index),
                len(posData.index))
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    # Check if columns in both dataframes are the same
    if (set(negData.columns) != set(posData.columns)):
        diffCols = set(negData.columns).symmetric_difference(posData.columns)
        raise IOError(("Input dataframes do not share the same column names: "
                       "{0}").format(', '.join(sorted(map(str, diffCols)))))
    # Check for misspelling errors in m/z or retention time column names
    if ((mzCol not in negData.columns) or (rtCol not in negData.columns)):
        raise KeyError("Missing '{0}' or '{1}' column(s)".format(mzCol, rtCol))
    # Get the indices for intensity columns
    firstIndex = parameters['firstSampleIndex'] - 1
    lastIndex = firstIndex + parameters['numSamples']
    # Total intensity mean of each frame, used to decide which frame of
    # a matching pair is kept
    negData['TotalMean'] = negData.iloc[:, firstIndex:lastIndex].apply(
        _positive_mean, axis=1)
    posData['TotalMean'] = posData.iloc[:, firstIndex:lastIndex].apply(
        _positive_mean, axis=1)
    # The output follows the column layout of the negative dataframe
    negCol = list(negData.columns.values)
    if ('Polarity' not in negCol):
        raise KeyError('Polarity')
    polColIndex = negCol.index('Polarity')
    # Tag appended to the polarity of the frame kept from each match
    if (parameters['combineIntensities']):
        matchTag = ' (Combined)'
    else:
        matchTag = ' (Both)'
    nmz = negData[mzCol].values
    nrt = negData[rtCol].values
    nmeans = negData['TotalMean'].values
    pmz = posData[mzCol].values
    prt = posData[rtCol].values
    pmeans = posData['TotalMean'].values
    # Output rows are gathered in a list and turned into a dataframe
    # once: DataFrame.append() was removed in pandas 2.0 and copies the
    # whole dataframe on every call
    rows = []
    # Start progress bar
    progress = 0
    total = len(nmz) + 1
    print_progress_bar(progress, total, prefix='Amalgamator progress:')
    # Loop through the negative dataframe by row position
    for i in range(len(nmz)):
        # Update progress bar
        progress += 1
        print_progress_bar(progress, total, prefix='Amalgamator progress:')
        negRow = negData.iloc[i]
        rtRange = rt_tol_range(nrt[i], parameters['maxRTDiffAdjFrame'])
        # First, look for H2 matches. If there are none, look for CH4
        # matches.
        for massShift in (HYDROGEN, METHANE):
            targetMZ = nmz[i] + massShift
            mzRange = mz_tol_range(targetMZ, parameters['mzFixedError'],
                                   parameters['mzPPMError'])
            matches = list(
                numpy.where((pmz >= mzRange[0]) & (pmz <= mzRange[1])
                            & (prt >= rtRange[0])
                            & (prt <= rtRange[1]))[0])
            if (not matches):
                continue
            indMatch = __bestMatch__(matches, targetMZ, pmz, nrt[i], prt,
                                     parameters)
            posRow = posData.iloc[indMatch]
            # Keep the frame with the highest total mean (the negative
            # one in case of a tie)
            if (pmeans[indMatch] > nmeans[i]):
                kept, discarded = posRow, negRow
            else:
                kept, discarded = negRow, posRow
            # reindex() returns a new series in the output column
            # order, so the input dataframes are not modified
            kept = kept.reindex(negCol)
            if (parameters['combineIntensities']):
                # Add up the intensities of both frames (aligned by
                # column name)
                keptInten = kept.iloc[firstIndex : lastIndex]
                combined = keptInten + discarded.iloc[firstIndex : lastIndex]
                kept.iloc[firstIndex : lastIndex] = \
                        combined.reindex(keptInten.index).values
            kept.iloc[polColIndex] = kept.iloc[polColIndex] + matchTag
            rows.append(kept)
            logger.info('Match found: Negative ID %d - Positive ID %d.',
                        negData.iloc[i, 0], posData.iloc[indMatch, 0])
            # Remove match from positive dataframe, avoiding writing
            # the action to the log file. 'indMatch' is a row position,
            # so translate it to its index label before dropping.
            matchLabel = posData.index[indMatch]
            if (isinstance(posData, LFDataFrame)):
                super(LFDataFrame, posData).drop(matchLabel, inplace=True)
            else:
                posData.drop(matchLabel, inplace=True)
            posData.reset_index(inplace=True, drop=True)
            pmz = posData[mzCol].values
            prt = posData[rtCol].values
            pmeans = posData['TotalMean'].values
            break
        else:
            # No match in the positive dataframe: keep the negative
            # frame untouched
            rows.append(negRow)
    # Append what remains in the positive dataframe (unmatched positive
    # m/z values)
    results = pandas.concat(
        [pandas.DataFrame(rows, columns=negCol), posData], ignore_index=True)
    # Enforce the column order of the negative dataframe regardless of
    # how the installed pandas version sorts non-aligned columns
    results = results.reindex(columns=negCol)
    results.drop('TotalMean', axis=1, inplace=True)
    # Sort results by m/z and retention time and create the CSV file
    results.sort_values([mzCol, rtCol], inplace=True, kind='mergesort')
    results.to_csv(os.path.join(dst, 'amalgamated.csv'), index=False)
    # Update progress bar
    print_progress_bar(total, total, prefix='Amalgamator progress:')
    # Write the final information in log file
    logger.info('Amalgamator completed. Output dataframe has %d rows.\n',
                len(results.index))


def amalgamate_data(negData, posData, parameters, dst=''):
    # type: (object, object, LFParameters, str) -> None
    """Amalgamate negative and positive ion polarity dataframes.

    'negData' and 'posData' have to match the same column layout as the
    output files from LipidFinder's PeakFilter module.
    For those frames with matching m/z and retention time, the one with
    the lowest total intensity mean is discarded (the positive one in
    case of a tie). Both files must have the same column headings,
    including a 'Polarity' column. If 'dst' is not an absolute path,
    the current working directory will be used as starting point. If
    "amalgamated.csv" file already exists, it will be overwritten.

    Both input dataframes are modified: a 'TotalMean' column is added
    to them and the matched frames are removed from 'posData'.

    Keyword Arguments:
        negData    -- negative polarity LFDataFrame or pandas.DataFrame
                      instance
        posData    -- positive polarity LFDataFrame or pandas.DataFrame
                      instance
        parameters -- LipidFinder's Amalgamator parameters instance
        dst        -- destination directory where the log file and the
                      amalgamated data CSV file will be saved
                      [default: current working directory]

    Raise IOError if the dataframes do not share the same column names
    and KeyError if the m/z, retention time or 'Polarity' column is
    missing.
    """
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'amalgamator.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    handler.setFormatter(logging.Formatter('[%(asctime)s] %(message)s'))
    logger.addHandler(handler)
    try:
        _amalgamate_frames(negData, posData, parameters, dst, logger)
    finally:
        # Always release the handler, even on error: otherwise it stays
        # attached to the root logger and every later call writes
        # duplicated lines to the log file
        handler.close()
        logger.removeHandler(handler)
Esempio n. 5
0
def _isotope_candidates(array, index, isoPeak, parameters):
    """Return the 'array' row indexes of the features that qualify as
    the 'isoPeak'-th isotope of the feature in row 'index'.

    A candidate must match the expected isotope m/z and the parent's
    retention time (RT), both within tolerance, and have an intensity
    within the range expected for that isotope number. An empty array
    is returned if there are no candidates.
    """
    parentMZ = array[index, 0]
    # Get the first and last indexes of the frames that are within the
    # isotope m/z range for the current analyte
    isotopeMZ = parentMZ + ISO_OFFSET * isoPeak
    minMZ, maxMZ = mz_tol_range(isotopeMZ, parameters['mzFixedError'],
                                parameters['mzPPMError'])
    first, last = numpy.searchsorted(array[:, 0], [minMZ, maxMZ])
    if (first == last):
        # Have not found any analyte with an isotope-like m/z
        return numpy.array([], dtype=int)
    # Filter m/z matches with the same RT as the parent
    minRT, maxRT = rt_tol_range(array[index, 1],
                                parameters['maxRTDiffAdjFrame'])
    windowRT = array[first : last, 1]
    # where() returns positions within the m/z window: shift them to
    # get row indexes of 'array'
    candidates = numpy.where((windowRT >= minRT) & (windowRT <= maxRT))[0]
    candidates = candidates + first
    if (len(candidates) == 0):
        # No candidates are within the same RT
        return candidates
    # Filter the candidate isotopes by intensity. The intensity range
    # coefficients vary depending on the isotope number.
    parentInten = array[index, 2]
    if (isoPeak <= 2):
        # Get an estimated maximum number of C in the molecule
        numC = round(parentMZ / 12)
        # Calculate isotopic distribution based on polynomial expansion
        if (isoPeak == 1):
            baseIntensity = parentInten * (numC ** 1.3) * 0.002
        else:
            baseIntensity = parentInten * (numC ** 1.7) * 0.0001
        minIntensity = baseIntensity * parameters['isoIntensityCoef'][0]
        maxIntensity = baseIntensity * parameters['isoIntensityCoef'][1]
    else:
        # Calculate isotopic distribution with the same formula as
        # CAMERA (from XCMS)
        minIntensity = parentInten * float('1e-{0}'.format(isoPeak + 2))
        maxIntensity = parentInten * 2
    intensities = array[candidates, 2]
    # Select by boolean mask so the result keeps the real row index of
    # each candidate. The RT matches are not necessarily contiguous, so
    # adding an offset to their positions would select the wrong rows.
    return candidates[(intensities >= minIntensity)
                      & (intensities <= maxIntensity)]


def _detect_sample_isotopes(array, parameters):
    """Return an array with the tagged parents and their corresponding
    isotopes in the same order as in the given sample array.

    Parents are tagged as "[<ID>][M]<sign>" and their isotopes as
    "[<ID>][M+<isotope number>]<sign>", where <ID> is the parent's index
    in the original dataframe and <sign> is the polarity symbol.
    Features that are neither a parent nor an isotope keep an empty
    string. A feature is only considered a parent if its first isotope
    is found. The m/z lookup uses a binary search, so it relies on the
    first column of 'array' being sorted in ascending order.

    Keyword Arguments:
        array      -- array with m/z, retention time (RT), sample's
                      intensity mean and index of the original dataframe
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    # Get the corresponding symbol for the polarity of the data (+ or -)
    polSign = '+' if (parameters['polarity'].lower() == 'positive') else '-'
    # Create an array of empty strings that will contain the tagged
    # parents and their corresponding isotopes
    tagArray = numpy.full(len(array), '', dtype=object)
    # Loop over each m/z to search for isotopes
    isotopesIndex = set()
    for index in range(0, len(array)):
        # Skip if frame has already been identified as an isotope
        if (array[index, 3] in isotopesIndex):
            continue
        # Set the tag ID before the isotope loop: the loop's "else"
        # clause needs it even when the loop body never runs
        tagID = int(array[index, 3])
        for isoPeak in range(1, parameters['numIsotopes'] + 1):
            isotopes = _isotope_candidates(array, index, isoPeak, parameters)
            if (len(isotopes) == 0):
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                continue
            # Tag the analyte as isotope and save its index to avoid
            # checking it as parent of other analytes
            tagArray[isotopes] = '[{0}][M+{1}]{2}'.format(tagID, isoPeak,
                                                          polSign)
            isotopesIndex.update(array[isotopes, 3])
        else:
            # Tag the analyte as parent
            tagArray[index] = '[{0}][M]{1}'.format(tagID, polSign)
    return tagArray
def _lipid_stack_members(array, parentIndex, stackMZ, parameters):
    """Return the 'array' row indexes of the features that shape a
    lipid stack with the feature in row 'parentIndex' (not included).

    Stack members differ from the parent in a multiple of 'stackMZ' and
    elute at the parent's retention time (RT), both within tolerance.
    """
    parentMZ, parentRT = array[parentIndex, 0:2]
    minRT, maxRT = rt_tol_range(parentRT, parameters['maxRTDiffAdjFrame'])
    sameRT = (minRT <= array[:, 1]) & (array[:, 1] <= maxRT)
    stackDiff = 0
    gapCount = 0
    stackList = []
    # The gap count is not reset when a member is found: the search
    # stops once the total number of gaps exceeds the maximum
    while (gapCount <= parameters['maxStackGap']):
        # Calculate the expected m/z of the next feature
        stackDiff += stackMZ
        minMZ, maxMZ = mz_tol_range(parentMZ + stackDiff,
                                    parameters['mzFixedError'],
                                    parameters['mzPPMError'])
        matches = numpy.where(
                (minMZ <= array[:, 0]) & (array[:, 0] <= maxMZ) & sameRT)[0]
        if (len(matches) == 0):
            gapCount += 1
        else:
            # Select the feature with the closest RT to parent
            stackList.append(matches[numpy.absolute(
                    array[matches, 1] - parentRT).argmin()])
    return stackList


def _contaminant_stack_members(array, parentIndex, stackMZ, parameters):
    """Return the 'array' row indexes of the largest contaminant stack
    found for the feature in row 'parentIndex' (not included).

    Stack members differ in 'stackMZ' (within tolerance) and the RT
    between every two consecutive members is the same.
    """
    parentMZ, parentRT = array[parentIndex, 0:2]
    # The RT gap is unknown until the stack has two features, so the
    # next feature only has to elute after the parent's RT tolerance
    # window
    minRT = rt_tol_range(parentRT, parameters['maxRTDiffAdjFrame'])[1]
    laterRT = (minRT < array[:, 1])
    stackDiff = 0
    stackList = []
    # Get every possible stack within the maximum gap distance and keep
    # the largest one
    for gapCount in range(0, parameters['maxStackGap']):
        # Calculate the expected m/z of next stack feature
        stackDiff += stackMZ
        minMZ, maxMZ = mz_tol_range(parentMZ + stackDiff,
                                    parameters['mzFixedError'],
                                    parameters['mzPPMError'])
        matches = numpy.where(
                (minMZ <= array[:, 0]) & (array[:, 0] <= maxMZ) & laterRT)[0]
        # Explore every possible stack (different RT gap)
        for match in matches:
            # 'gapCount' m/z steps were skipped to reach this feature,
            # so split its RT distance to the parent evenly
            rtGap = (array[match, 1] - parentRT) / (gapCount + 1)
            matchStackList = _collect_stack(array, match, rtGap, stackMZ,
                                            parameters)
            if (len(matchStackList) > len(stackList)):
                stackList = matchStackList
    return stackList


def remove_stacks(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Detect lipid and contaminant stacks and delete all ions present
    (in lipid stacks the parent is retained).

    A stack is a series of ions differing in m/z by a user-defined fixed
    mass shift. Lipid stacks elute at same retention time (RT) whilst
    contaminant stacks increase their RT as overall m/z increases.
    Firstly, the m/z is checked for a lipid stack and if a stack is
    present, all ions except the parent are deleted and the next m/z is
    checked. If no lipid stack is found then the m/z is checked for
    contaminant stacks. If found, the whole stack including the parent
    is removed. The list of lipid and contaminant stack mass differences
    is imported from the stacks CSV file. If the 'lipidStackAddition'
    parameter is set, the intensities of the ions removed from a lipid
    stack are added to its parent.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    firstSample = parameters['firstSampleIndex'] - 1
    lastSample = firstSample \
                 + (parameters['numSamples'] * parameters['numTechReps'])
    # Read the CSV file with the stacks information
    stacks = pandas.read_csv(parameters['stacksCSVPath'])
    # Separate lipid and contaminant stacks
    lipidStacksMZ = stacks.loc[stacks['Category'] == 'Lipid', 'MZ'].values
    contStacksMZ = stacks.loc[stacks['Category'] == 'Contaminant', 'MZ'].values
    # Build an array with m/z, RT and row position in 'data' to track
    # the frames removed as part of a stack. Rows are deleted from the
    # array as stacks are found, so the third column is the only
    # reliable link back to 'data'.
    array = numpy.stack((data[mzCol].values, data[rtCol].values,
                         numpy.arange(len(data.index))), axis=-1)
    # Start the loop to find every stack in the dataset
    parentIndex = 0
    toRemove = {'lipid': [], 'contam': []}
    while (parentIndex < (len(array) - 1)):
        # Stacks can be of only one type: if a lipid stack is found the
        # parent m/z will not be analysed as part of a contaminant stack
        for stackMZ in lipidStacksMZ:
            stackList = _lipid_stack_members(array, parentIndex, stackMZ,
                                             parameters)
            if (len(stackList) >= MIN_LIPID_STACK):
                # Mark frames to be removed as part of the lipid stack.
                # The array holds floats: cast the row positions back
                # to integers.
                rows = array[stackList, 2].astype(int).tolist()
                toRemove['lipid'].extend(rows)
                if (parameters['lipidStackAddition']):
                    # Add stack intensities to parent frame, located by
                    # its row position in 'data' ('parentIndex' is only
                    # valid for the array)
                    parentRow = int(array[parentIndex, 2])
                    data.iloc[parentRow, firstSample : lastSample] += \
                            data.iloc[rows, firstSample : lastSample].sum(
                                    axis=0)
                # Remove frames in stack from array
                array = numpy.delete(array, stackList, axis=0)
                break
        else:
            # No lipid stacks were found: search for contamination
            # stacks where m/z has to be an exact match (within
            # tolerance) and the RT between every two consecutive
            # features is the same (in accordance with the m/z
            # difference)
            for stackMZ in contStacksMZ:
                stackList = _contaminant_stack_members(
                        array, parentIndex, stackMZ, parameters)
                if ((len(stackList) + 1) >= MIN_CONTAM_STACK):
                    # Mark frames to be removed as part of the
                    # contaminant stack (parent included)
                    stackList.append(parentIndex)
                    rows = array[stackList, 2].astype(int).tolist()
                    toRemove['contam'].extend(rows)
                    # Remove frames in stack from array
                    array = numpy.delete(array, stackList, axis=0)
                    # Adjust index after parent has been removed
                    parentIndex -= 1
                    break
        parentIndex += 1
    # Remove lipid and/or contaminant stack features
    if (toRemove['lipid'] or toRemove['contam']):
        # Translate row positions to index labels before dropping
        # anything, since dropping changes the positions
        lipidLabels = data.index[toRemove['lipid']].tolist()
        contamLabels = data.index[toRemove['contam']].tolist()
        data.drop('Stacks removal (lipid)', labels=lipidLabels,
                  inplace=True)
        data.drop('Stacks removal (contaminant)', labels=contamLabels,
                  inplace=True)
        # Reset the index of dataframe after the update
        data.reset_index(inplace=True, drop=True)
def __rep_adduct_removal__(replicate,    # pandas.Series
                           adductsPairs, # list
                           adducts,      # pandas.DataFrame
                           mzArray,      # numpy.array
                           rtArray,      # numpy.array
                           parameters    # LFParameters
                           ):
    # type: (...) -> pandas.Series
    """Detect pairs of adducts in the given sample replicate and set to
    zero the lowest intensity of each pair.

    The m/z and retention time (RT) matches are done within a tolerance.
    If the 'adductAddition' parameter is set, the intensity removed is
    added to the adduct that is kept. 'replicate' is modified in place
    and returned.

    Keyword Arguments:
        replicate    -- replicate's intensities
        adductsPairs -- list of paired adducts
        adducts      -- adducts information (name, mass factor and mass
                        offset are read from its first three columns)
        mzArray      -- sample replicate's m/z values
        rtArray      -- sample replicate's rt values
        parameters   -- LipidFinder's PeakFilter parameters instance
    """
    # Get the index of all intensities that are not zero
    nonZeroIndices = replicate.values.nonzero()[0]
    # Create an array of intensities that are not zero, and the arrays
    # with their corresponding m/z and RT values
    nzIntensities = numpy.copy(replicate.values[nonZeroIndices])
    nzMZ = numpy.copy(mzArray[nonZeroIndices])
    nzRT = numpy.copy(rtArray[nonZeroIndices])
    # Create an array to hold a reference to the first adduct found
    # (default: ""). It must have "object" type: a "str" array created
    # from empty strings can only hold one character per element, which
    # would truncate the adduct names stored in it.
    adductTags = numpy.full(nonZeroIndices.shape, '', dtype=object)
    for pair in adductsPairs:
        # Get the adducts information of the pair to create the lambda
        # function to calculate the offset of the given mass
        pairInfo = adducts.loc[adducts.iloc[:, 0].isin(pair)]
        # abs() function makes the pairs order insensitive
        get_offset = lambda x: \
                abs(x - (pairInfo.iloc[1, 1] * (x - pairInfo.iloc[0, 2]) /
                         pairInfo.iloc[0, 1] + pairInfo.iloc[1, 2]))
        index = 0
        while (index < (nonZeroIndices.size - 1)):
            # Get first adduct tag of the source index
            tag = adductTags[index]
            # If source frame has been previously tagged and tag is not
            # equal to lower mass species then this frame cannot be
            # considered further so consider next frame
            if (tag and (tag != pair[0])):
                index += 1
                continue
            # Get the m/z and RT of the current frame
            mz = nzMZ[index]
            rt = nzRT[index]
            # The adduct mass for the source mass with current pair
            adductMZ = mz + get_offset(mz)
            # The limits of a mass that could be an adduct of the source
            # mass, including the error tolerance
            minAdductMZ, maxAdductMZ = mz_tol_range(
                    adductMZ, parameters['mzFixedError'],
                    parameters['mzPPMError'])
            # Get the tolerance range for the RT
            minRT, maxRT = rt_tol_range(rt, parameters['maxRTDiffAdjFrame'])
            # Get the index of the potential adducts by m/z and RT
            potentialAdducts = numpy.where(
                    (nzMZ >= minAdductMZ) & (nzMZ <= maxAdductMZ)
                    & (nzRT >= minRT) & (nzRT <= maxRT))[0]
            if (potentialAdducts.size > 0):
                # Get the index of the adduct with the closest RT to the
                # subject RT
                adductIndex = potentialAdducts[numpy.absolute(
                        nzRT[potentialAdducts] - rt).argmin()]
                # If the intensity of the adduct is greater than the
                # intensity of the source, set the latter to 0.
                # Otherwise, set the former to 0.
                if (nzIntensities[adductIndex] > nzIntensities[index]):
                    # Go ahead if the adduct has not been tagged yet
                    if (not adductTags[adductIndex]):
                        # Record adduct species
                        adductTags[adductIndex] = pairInfo.iloc[1, 0]
                        if (parameters['adductAddition']):
                            replicate.values[nonZeroIndices[adductIndex]] += \
                                    nzIntensities[index]
                        # Set source intensity to 0
                        replicate.values[nonZeroIndices[index]] = 0
                        # Remove source from each array
                        nonZeroIndices = numpy.delete(nonZeroIndices, index)
                        nzIntensities = numpy.delete(nzIntensities, index)
                        nzMZ = numpy.delete(nzMZ, index)
                        nzRT = numpy.delete(nzRT, index)
                        adductTags = numpy.delete(adductTags, index)
                        # 'index' will now point to the next frame
                        continue
                else:
                    if (not tag):
                        # Record adduct species
                        adductTags[index] = pairInfo.iloc[0, 0]
                    if (parameters['adductAddition']):
                        replicate.values[nonZeroIndices[index]] += \
                                nzIntensities[adductIndex]
                    # Set adduct intensity to 0
                    replicate.values[nonZeroIndices[adductIndex]] = 0
                    # Remove adduct from each array
                    nonZeroIndices = numpy.delete(nonZeroIndices, adductIndex)
                    nzIntensities = numpy.delete(nzIntensities, adductIndex)
                    nzMZ = numpy.delete(nzMZ, adductIndex)
                    nzRT = numpy.delete(nzRT, adductIndex)
                    adductTags = numpy.delete(adductTags, adductIndex)
                    if (adductIndex < index):
                        # The removal shifted the source (and every
                        # frame after it) one position back: compensate
                        # so the increment below does not skip a frame
                        index -= 1
            index += 1
    return replicate
Esempio n. 8
0
def rm_neutral_loss_frags(
        array,  # type: numpy.ndarray
        losses,  # type: pandas.DataFrame
        parameters  # type: LFParameters
):
    # type: (...) -> list[int]
    """Locate the in-source fragments caused by neutral losses.

    Every feature of 'array' whose m/z is not below a cut-off listed in
    'losses' is treated as a potential parent. Each neutral loss m/z
    tied to that cut-off is subtracted from the parent's m/z, and any
    feature found at the resulting m/z (within the m/z tolerance given
    by 'mzFixedError' and 'mzPPMError') that also elutes within
    RT_TOLERANCE of the parent's retention time (RT) is reported.

    The m/z look-ups rely on numpy.searchsorted(), so 'array' is
    expected to be sorted by ascending m/z (first column).

    Keyword arguments:
        array      -- 2D array where column 0 holds the m/z and column 1
                      the RT of each feature; no other column is read
                      by this function
        losses     -- dataframe with the neutral loss m/z ('MZ') and
                      the m/z cut-off ('MZCutOff') to apply to each one
        parameters -- LipidFinder's PeakFilter parameters instance

    Return the row positions in 'array' (in no particular order, without
    repetitions) of the features identified as fragments.
    """
    # One row per neutral loss: [m/z cut-off, neutral loss m/z]
    cutOffLossPairs = numpy.stack(
        (losses['MZCutOff'].values, losses['MZ'].values), axis=-1)
    # Group the neutral loss m/z values that share the same cut-off
    lossesByCutOff = {}
    for cutOff, lossMZ in cutOffLossPairs:
        sameCutOff = lossesByCutOff.setdefault(cutOff, [])
        sameCutOff.append(lossMZ)
    fragments = set()
    for cutOff, lossList in lossesByCutOff.items():
        mzValues = array[:, 0]
        # Position of the first feature whose m/z is not below the
        # cut-off: only features from here onwards can be parents
        firstParent = numpy.searchsorted(mzValues, cutOff)
        for parent in range(firstParent, len(array)):
            for lossMZ in lossList:
                # An in-source fragment shows up at the parent's m/z
                # minus the neutral loss, eluting at the same RT
                targetRange = mz_tol_range(mzValues[parent] - lossMZ,
                                           parameters['mzFixedError'],
                                           parameters['mzPPMError'])
                # Half-open window [first, last) of features that fall
                # within the m/z tolerance range
                first, last = numpy.searchsorted(mzValues, targetRange)
                if (first != last):
                    # Keep only the candidates that co-elute with the
                    # parent
                    minRT, maxRT = rt_tol_range(array[parent, 1],
                                                RT_TOLERANCE)
                    candidateRT = array[first:last, 1]
                    hits = numpy.where((candidateRT >= minRT)
                                       & (candidateRT <= maxRT))[0]
                    if (len(hits) != 0):
                        # 'hits' is relative to the window start, so
                        # shift it to get positions in 'array'; the set
                        # absorbs fragments matched more than once
                        fragments.update(set(hits + first))
    return list(fragments)