Example #1
def _update_status(data, stepDst, verbose, stepNum):
    # type: (LFDataFrame, str, bool, int) -> int
    """Create CSV file from 'data' in 'stepDst', update progress bar and
    return incremented step number.

    Keyword Arguments:
        data    -- LFDataFrame instance
        stepDst -- destination directory for CSV 'data' file
        verbose -- whether to create the CSV 'data' file
        stepNum -- step number completed
    """
    # Update progress bar
    print_progress_bar(INCREMENT * stepNum, 100, prefix='PeakFilter progress:')
    if (verbose):
        # Create a CSV file with the whole processed dataframe
        outFileName = 'peakfilter_step_{:02d}.csv'.format(stepNum)
        data.to_csv(os.path.join(stepDst, outFileName), index=False)
    stepNum += 1
    return stepNum
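
A minimal sketch of how this helper is typically chained between processing steps (mirroring its use in Example #4 below). The step functions are placeholders, and 'INCREMENT', 'print_progress_bar' and the other module-level names are assumed to be defined as in the code above.

# Hypothetical illustration only: each completed step may write a CSV
# snapshot and always advances the progress bar before the next step runs.
stepDst = os.path.join('results', 'step_by_step')     # assumed output folder
stepNum = 1
for step in (first_step, second_step):                # placeholder step functions
    step(data, parameters)
    stepNum = _update_status(data, stepDst, verbose=True, stepNum=stepNum)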
Example #2
def amalgamate_data(negData, posData, parameters, dst=''):
    # type: (object, object, LFParameters, str) -> None
    """Amalgamate negative and positive ion polarity dataframes.

    'negData' and 'posData' must have the same column layout and column
    headings as the output files from LipidFinder's PeakFilter module.
    For frames with matching m/z and retention time, the one with the
    lower total intensity mean is discarded. If 'dst' is not an absolute
    path, the current working directory will be used as starting point.
    If an "amalgamated.csv" file already exists, it will be overwritten.

    Keyword Arguments:
        negData    -- negative polarity LFDataFrame or pandas.DataFrame
                      instance
        posData    -- positive polarity LFDataFrame or pandas.DataFrame
                      instance
        parameters -- LipidFinder's Amalgamator parameters instance
        dst        -- destination directory where the log file and the
                      amalgamated data CSV file will be saved
                      [default: current working directory]
    """
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'amalgamator.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Write initial information in log file
    logger.info(("Starting Amalgamator. Negative dataframe has %d rows and "
                 "Positive dataframe has %d rows."), len(negData.index),
                len(posData.index))
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    # Check if columns in both dataframes are the same
    if (set(negData.columns) != set(posData.columns)):
        diffCols = set(negData.columns).symmetric_difference(posData.columns)
        raise IOError(("Input dataframes do not share the same column names: "
                       "{0}").format(', '.join(diffCols)))
    # Check for misspelling errors in m/z or retention time column names
    if ((mzCol not in negData.columns) or (rtCol not in negData.columns)):
        raise KeyError("Missing '{0}' or '{1}' column(s)".format(mzCol, rtCol))
    # Get the indices for intensity columns
    firstIndex = parameters['firstSampleIndex'] - 1
    lastIndex = firstIndex + parameters['numSamples']
    # Calculate the mean of the non-zero intensity values in the sample
    # columns of each input dataframe and round it to the nearest
    # integer. Replace any NaN output from mean() by zero.
    totalMean = lambda x: numpy.rint(
        numpy.nan_to_num(x[numpy.where(x > 0)[0]].mean())).astype(int)
    negData['TotalMean'] = negData.iloc[:,
                                        firstIndex:lastIndex].apply(totalMean,
                                                                    axis=1)
    posData['TotalMean'] = posData.iloc[:,
                                        firstIndex:lastIndex].apply(totalMean,
                                                                    axis=1)
    nind = negData.index.values
    nmz = negData[mzCol].values
    nrt = negData[rtCol].values
    nmeans = negData['TotalMean'].values
    negCol = list(negData.columns.values)
    posCol = list(posData.columns.values)
    # Empty results dataframe
    results = pandas.DataFrame(columns=negCol)
    polColIndex = results.columns.get_loc('Polarity')
    # Start progress bar
    progress = 0
    total = len(nind) + 1
    print_progress_bar(progress, total, prefix='Amalgamator progress:')
    # Loop through indices in negative file
    for i in nind:
        # Update progress bar
        progress += 1
        print_progress_bar(progress, total, prefix='Amalgamator progress:')
        negMass = nmz[i]
        negRT = nrt[i]
        pmz = posData[mzCol].values
        prt = posData[rtCol].values
        pmeans = posData['TotalMean'].values
        negMassH2 = negMass + HYDROGEN
        mzRange = mz_tol_range(negMassH2, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        rtRange = rt_tol_range(negRT, parameters['maxRTDiffAdjFrame'])
        matchesH2 = list(
            numpy.where((pmz >= mzRange[0]) & (pmz <= mzRange[1])
                        & (prt >= rtRange[0])
                        & (prt <= rtRange[1]))[0])
        # First, look for H2 matches
        if (matchesH2):
            indMatch = __bestMatch__(matchesH2, negMassH2, pmz, negRT, prt,
                                     parameters)
            # Keep the frame with the highest total mean
            if (pmeans[indMatch] > nmeans[i]):
                results = results.append(posData.iloc[indMatch],
                                         ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + negData.iloc[i, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            else:
                results = results.append(negData.iloc[i], ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + posData.iloc[indMatch, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            logger.info('Match found: Negative ID %d - Positive ID %d.',
                        negData.iloc[i, 0], posData.iloc[indMatch, 0])
            # Remove match from positive dataframe, avoiding writing
            # the action to the log file
            if (isinstance(posData, LFDataFrame)):
                super(LFDataFrame, posData).drop(indMatch, inplace=True)
            else:
                posData.drop(indMatch, inplace=True)
            posData.reset_index(inplace=True, drop=True)
            pmz = posData[mzCol].values
            prt = posData[rtCol].values
            pmeans = posData['TotalMean'].values
            continue
        # If there are no H2 matches, look for CH4 matches
        negMassCH4 = negMass + METHANE
        mzRange = mz_tol_range(negMassCH4, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        matchesHCH3 = list(
            numpy.where((pmz >= mzRange[0]) & (pmz <= mzRange[1])
                        & (prt >= rtRange[0])
                        & (prt <= rtRange[1]))[0])
        if (matchesHCH3):
            indMatch = __bestMatch__(matchesHCH3, negMassCH4, pmz, negRT, prt,
                                     parameters)
            # Keep the frame with the highest total mean
            if (pmeans[indMatch] > nmeans[i]):
                results = results.append(posData.iloc[indMatch],
                                         ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + negData.iloc[i, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            else:
                results = results.append(negData.iloc[i], ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + posData.iloc[indMatch, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            logger.info('Match found: Negative ID %d - Positive ID %d.',
                        negData.iloc[i, 0], posData.iloc[indMatch, 0])
            # Remove match from positive dataframe, avoiding writing
            # the action to the log file
            if (isinstance(posData, LFDataFrame)):
                super(LFDataFrame, posData).drop(indMatch, inplace=True)
            else:
                posData.drop(indMatch, inplace=True)
            posData.reset_index(inplace=True, drop=True)
            pmz = posData[mzCol].values
            prt = posData[rtCol].values
            pmeans = posData['TotalMean'].values
            continue
        results = results.append(negData.iloc[i], ignore_index=True)
    # Append what remains in the positive dataframe (unmatched positive
    # m/z values)
    results = results.append(posData, ignore_index=True)
    if (pandas.__version__ < '0.23.0'):
        # Fix unexpected column sorting from append() in pandas v0.20.3
        # or newer (solved in v0.23.0 with argument "sort=False")
        results = results.reindex(negCol, axis=1)
    results.drop('TotalMean', axis=1, inplace=True)
    # Sort results by m/z and retention time and create the CSV file
    results.sort_values([mzCol, rtCol], inplace=True, kind='mergesort')
    results.to_csv(os.path.join(dst, 'amalgamated.csv'), index=False)
    # Update progress bar
    print_progress_bar(total, total, prefix='Amalgamator progress:')
    # Write the final information in log file and remove handler
    logger.info('Amalgamator completed. Output dataframe has %d rows.\n',
                len(results.index))
    handler.close()
    logger.removeHandler(handler)
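
A hedged usage sketch for amalgamate_data(): the input file names follow the PeakFilter output pattern described in Example #4, while the LFParameters import path and constructor arguments are assumptions rather than the documented LipidFinder API.

# Hypothetical usage sketch; the import path and constructor are assumptions.
import pandas
from LipidFinder.Configuration import LFParameters     # assumed location

negData = pandas.read_csv('peakfilter_negative.csv')
posData = pandas.read_csv('peakfilter_positive.csv')
parameters = LFParameters(module='Amalgamator')        # assumed constructor
amalgamate_data(negData, posData, parameters, dst='results')
# Expected output: 'results/amalgamated.csv' and 'results/amalgamator.log'.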
Example #3
def bulk_structure_search(data, parameters, dst=''):
    # type: (object, LFParameters, str) -> None
    """Search in LIPID MAPS for matches of the m/z values in the input
    dataframe.

    'data' must have, at least, m/z, retention time (RT) and "Polarity"
    columns. The adducts included in the search as well as the specific
    in-house lipidomics database, the mass tolerance and the lipid
    categories are provided in 'parameters'.
    The resulting dataframe will include every bulk structure match for
    each m/z, including its RT, main class, category and other relevant
    information. If 'dst' is not an absolute path, the current working
    directory will be used as starting point. If "mssearch_<db>.xlsx"
    already exists, it will be overwritten without warning.
    "<db>" stands for the selected LIPID MAPS database.

    Keyword arguments:
        data       -- LFDataFrame or pandas.DataFrame instance
        parameters -- LipidFinder's MS Search parameters instance
        dst        -- destination directory where the log file, the
                      output XLSX file and the category scatter plot
                      figure (if selected) will be saved
                      [default: current working directory]
    """
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'mssearch.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Write initial information in log file
    logger.info('Starting MS Search on %s. Input dataframe has %d rows.',
                 parameters['database'], len(data.index))
    # Start progress bar
    progress = 0
    print_progress_bar(progress, 100, prefix='MSSearch progress:')
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    # Get the list of unique m/z values from 'data'
    mzList = data[mzCol].unique().tolist()
    numMZ = len(mzList)
    # Write the number of unique m/z values in the log file
    logger.info('%d unique m/z values found.', numMZ)
    # Get the list of target adducts from the parameters
    targetAdducts = parameters['targetAdducts']
    if (not targetAdducts):
        # If the list is empty, use the complete list of ion adducts
        targetAdducts = parameters._parameters['targetAdducts']['options']
    # Keep only the adduct information between brackets
    targetAdducts = [x[x.find('[') + 1 : x.find(']')] for x in targetAdducts]
    targetAdducts = ','.join(targetAdducts)
    if (parameters['mzToleranceUnit'] == 'Daltons'):
        tolerance = parameters['mzTolerance']
    # Get matches in batches to balance the number of requests and the
    # amount of information requested
    matches = pandas.DataFrame()
    # Calculate progress increment for each batch
    increment = 63.0 / numpy.ceil(float(numMZ) / BATCH_SIZE)
    for start in range(0, numMZ, BATCH_SIZE):
        mzBatch = mzList[start : start + BATCH_SIZE]
        # Get a string with one m/z per line (like a text file)
        mzStr = os.linesep.join(map(str, mzBatch))
        if (parameters['mzToleranceUnit'] == 'PPM'):
            # Calculate maximum tolerance in Da from tolerance in parts
            # per million (ppm)
            tolerance = mzBatch[-1] * parameters['mzTolerance'] / 1e6
        # Create the data package with the query
        if (parameters['categories']):
            mpData = MultipartEncoder(
                    fields={'CHOICE': parameters['database'], 'sort': 'DELTA',
                            'file': ('file', StringIO(mzStr), 'text/plain'),
                            'tol': str(tolerance), 'ion': targetAdducts,
                            'even': '2' if parameters['evenChains'] else '1',
                            'category': ','.join(parameters['categories'])})
        else:
            mpData = MultipartEncoder(
                    fields={'CHOICE': parameters['database'], 'sort': 'DELTA',
                            'file': ('file', StringIO(mzStr), 'text/plain'),
                            'tol': str(tolerance), 'ion': targetAdducts,
                            'even': '2' if parameters['evenChains'] else '1'})
        # Request the table containing the matches from LIPID MAPS
        try:
            response = requests.post(
                    LIPIDMAPS_URL, data=mpData,
                    headers={'Content-Type': mpData.content_type})
        except requests.exceptions.RequestException:
            raise Exception(("Connection error with the database. Please check "
                             "your network and try again after a few minutes."))
        # Go to next batch if this one returned nothing
        if (len(response.text) == 0):
            # Update progress bar
            progress += increment
            print_progress_bar(progress, 100, prefix='MSSearch progress:')
            continue
        # Process the response to create a dataframe
        batchMatches = pandas.read_csv(StringIO(response.text), sep='\t',
                                       engine='python', index_col=False)
        if (batchMatches.empty):
            # Update progress bar
            progress += increment
            print_progress_bar(progress, 100, prefix='MSSearch progress:')
            continue
        # Join all the information already gathered
        matches = matches.append(batchMatches, ignore_index=True)
        # Update progress bar
        progress += increment
        print_progress_bar(progress, 100, prefix='MSSearch progress:')
    if (matches.empty):
        matches = pandas.DataFrame(
                columns=[mzCol, 'Matched MZ', 'Delta', 'Bulk Structure',
                         'Formula', 'Adduct', 'Main Class', 'Category'])
    else:
        # Rename m/z column
        matches.rename(columns={'Input Mass': mzCol}, inplace=True)
    # Round 'Input Mass' values that might have been altered by LIPID
    # MAPS server
    matches[mzCol] = matches[mzCol].apply(round, ndigits=data._resolution)
    # Calculate the delta PPM of each row and add it to the dataframe
    dPPM = abs(matches[mzCol] - matches['Matched MZ']) * 1e6 / matches[mzCol]
    matches.insert(2, 'Delta_PPM', dPPM)
    if (parameters['mzToleranceUnit'] == 'PPM'):
        # Make sure all the matches comply with the m/z tolerance in ppm
        matches = matches[matches['Delta_PPM'] <= parameters['mzTolerance']]
    # Add RT and polarity values to each existing record and include the
    # rows in 'data' that did not have a match
    matches.insert(3, rtCol, 0.0)
    matches.insert(4, 'Polarity', '')
    # Calculate progress increment for each batch
    increment = 33.0 / numpy.ceil(len(data) / float(BATCH_SIZE))
    # Create the result dataframe with the match columns plus the
    # remaining columns from 'data'
    colNames = [x for x in list(data) if x not in [mzCol, rtCol, 'Polarity']]
    extraCols = []
    for column in colNames:
        if (column not in list(matches)):
            extraCols.append(column)
        else:
            # Keep all columns from source dataset, adding prefix "src_"
            # if that column name is already in the dataframe
            extraCols.append('src_' + column)
            data.rename(columns={column: 'src_' + column}, inplace=True)
    result = pandas.DataFrame(columns=list(matches) + extraCols)
    # Ensure the polarity column contains only strings so the
    # conditional test in the next loop works as expected
    data['Polarity'].replace(numpy.nan, '', regex=True, inplace=True)
    # For those m/z values with more than one RT, the whole set of
    # matches is replicated for every RT
    for index, row in data.iterrows():
        mzMatches = matches.loc[matches[mzCol] == row[mzCol]]
        # Remove positive adduct matches for m/z found in negative mode,
        # and negative adduct matches for m/z found in positive mode
        if (row['Polarity'].lower().startswith('n')):
            mzMatches = mzMatches.loc[mzMatches['Adduct'].str[-1] != '+']
        elif (row['Polarity'].lower().startswith('p')):
            mzMatches = mzMatches.loc[mzMatches['Adduct'].str[-1] != '-']
        if (mzMatches.empty):
            # Unmatched m/z from 'data'
            mzMatches = mzMatches.append(row[[mzCol, rtCol, 'Polarity']],
                                         ignore_index = True)
        else:
            # Copy RT and polarity values to each matched m/z
            mzMatches[rtCol] = row[rtCol]
            mzMatches['Polarity'] = row['Polarity']
        # Copy the extra columns (if any) to each matched m/z
        for col in extraCols:
            mzMatches[col] = row[col]
        result = result.append(mzMatches, ignore_index=True)
        if ((index + 1) % BATCH_SIZE == 0):
            # Update progress bar
            progress += increment
            print_progress_bar(progress, 100, prefix='MSSearch progress:')
    # Sort the results by m/z, delta PPM and matched m/z to ease the
    # manipulation of the output XLSX file
    result.sort_values([mzCol, 'Delta_PPM', 'Matched MZ'], inplace=True,
                       kind='mergesort')
    # Create the XLSX file with the whole putative profiling dataframe
    outPath = os.path.join(
            dst, 'mssearch_{0}.xlsx'.format(parameters['database'].lower()))
    result.to_excel(outPath, index=False, engine='xlsxwriter')
    if (parameters['summary']):
        # Create summary XLSX file from the putative profiling
        # dataframe, keeping only one row per m/z and RT with the most
        # frequent lipid category
        Summary.create_summary(result, parameters, dst)
    # Update progress bar
    print_progress_bar(98, 100, prefix='MSSearch progress:')
    # Generate the category scatter plot of the most common lipid
    # category per m/z and RT
    if (parameters['plotCategories']):
        DataPlots.category_scatterplot(result, parameters, dst)
    # Update progress bar
    print_progress_bar(100, 100, prefix='MSSearch progress:')
    # Write the final information in log file and close handler
    matches = result[result['Category'].notna()]
    logger.info('MS Search completed. %d matches found for %d m/z values.\n',
                 len(matches), len(matches[mzCol].unique()))
    handler.close()
    logger.removeHandler(handler)
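
A hedged usage sketch for bulk_structure_search(). The function reads 'data._resolution' when rounding the matched m/z values, so an LFDataFrame (which carries that attribute) is the safer input; the import paths and constructors below are assumptions rather than the documented LipidFinder API.

# Hypothetical usage sketch; import paths and constructors are assumptions.
from LipidFinder.Configuration import LFParameters     # assumed location
from LipidFinder.LFDataFrame import LFDataFrame        # assumed location

parameters = LFParameters(module='MSSearch')            # assumed constructor
data = LFDataFrame('amalgamated.csv', parameters)       # assumed constructor
bulk_structure_search(data, parameters, dst='results')
# Expected output: 'results/mssearch_<db>.xlsx' (plus the optional summary
# file and category scatter plot) and 'results/mssearch.log'.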
Example #4
def peak_filter(data, parameters, dst='', verbose=False):
    # type: (LFDataFrame, LFParameters, str, bool) -> None
    """Filter contaminants and redundant artifacts from a LC/MS data
    pre-processed by XCMS or another pre-processing tool.

    If 'dst' is not an absolute path, the current working directory will
    be used as starting point. If either "peakfilter_<polarity>.csv" or
    "peakfilter_<polarity>_summary.csv" files already exist, they will
    be overwritten. "<polarity>" stands for "positive" or "negative", as
    stated in the parameters.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
        dst        -- destination directory where the log file, the
                      processed data CSV file and the summary CSV file
                      will be saved [default: current working directory]
        verbose    -- create a folder inside 'dst' where the intermediate
                      results will be saved as CSV files [default: False]
    """
    # Start progress bar
    print_progress_bar(0, 100, prefix='PeakFilter progress:')
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'peakfilter.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Write initial information in log file
    logger.info('Starting PeakFilter. Input dataframe ("%s") has %d rows.',
                 data.src, len(data.index))
    # Prepare the folder structure to store the intermediate files
    stepDst = os.path.join(dst, 'step_by_step')
    if (verbose and not os.path.isdir(stepDst)):
        os.makedirs(stepDst)
    stepNum = 1
    # QC Sample Calculations
    if (parameters['numQCReps'] > 0):
        # Perform mean and RSD on QC samples
        qcRatio = QCCalcs.qc_rsd_ratio(data, parameters)
        # Write report in log file
        logger.info(("QC Sample Calculations completed. %.1f%% samples between"
                      " %d%% and %d%% QC-RSD"), qcRatio, parameters["QCRSD"][0],
                     parameters["QCRSD"][1])
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Solvent Calculations
    if ((parameters['numSolventReps'] > 0) and parameters['removeSolvents']):
        # Perform mean and RSD on solvent samples, perform the outlier
        # correction, remove frames where all technical replicates of
        # all samples are less than 'solventMinFoldDiff' times the
        # solvent mean, and subtract the solvent mean intensity from the
        # remaining sample replicate intensities
        SolventCalcs.remove_solvent_effect(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Background correction: remove low intensity frames
    SolventCalcs.remove_low_intensity_frames(data, parameters)
    if (parameters['preprocSoftware'] == 'XCMS'):
        # Get m/z clusters required by 'MassReassignment' and
        # 'BroadContaminant' modules
        Clustering.cluster_by_mz(data, parameters)
        # Create the "FeatureClusterID" column that will be used by
        # 'RTCorrection' step. In XCMS, each row is already a feature.
        data['FeatureClusterID'] = range(1, len(data) + 1)
    else:
        # Perform peak finding for any other pre-processing software
        PeakFinder.process_features(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # In-source ion fragment removal
    if (parameters['removeIonFrags']):
        InSrcFragRemoval.remove_in_src_frags(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Contaminant removal
    if (parameters['removeContaminants']):
        ContaminantRemoval.remove_contaminants(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Adduct removal
    if (parameters['removeAdducts']):
        ContaminantRemoval.remove_adducts(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Stack removal
    if (parameters['removeStacks']):
        ContaminantRemoval.remove_stacks(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Retention time correction of each set of sample replicates to fix
    # likely alignment errors from pre-processing tools other than XCMS
    if ((parameters['numTechReps'] > 1)
        and (parameters['preprocSoftware'] == 'Other')):
        RTCorrection.correct_retention_time(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Remove outliers from sample replicates
    OutlierCorrection.remove_outliers(data, parameters, src='samples')
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Calculate and add the mean of each sample's replicates
    SampleMeansCalc.calculate_sample_means(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Retention time correction to the means of the sample replicates
    if (parameters['correctRTMeans']):
        RTCorrection.correct_retention_time(data, parameters, True)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Assign each m/z in either a mass or feature cluster to the m/z of
    # the row containing the highest sample mean intensity
    MassReassignment.reassign_frame_masses(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Remove ions with similar intensities for the same m/z that are
    # likely to be contaminants
    BroadContaminant.process_all_features(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Isotope removal
    Deisotoping.remove_isotopes(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Mass defect filter: remove salt clusters
    if (parameters['filterMassDefect']):
        MassDefectFilter.remove_salt_clusters(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Calculate the False Discovery Rate
    if (parameters['calculateFDR']):
        try:
            fdrValue = FalseDiscoveryRate.get_fdr(data, parameters)
            message = ("False Discovery Rate for selected data and parameters: "
                       "{0:.2%}").format(fdrValue)
        except ValueError as e:
            message = 'ValueError: ' + e.args[0]
        except Exception as oe:
            message = 'OtherError: ' + oe.args[0]
        logger.info(message)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Create summary CSV file from the processed dataframe
    Summary.create_summary(data, parameters, dst)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Create a CSV file with the whole processed dataframe
    data['Polarity'] = parameters['polarity']
    outFileName = 'peakfilter_{0}.csv'.format(parameters['polarity'].lower())
    data.to_csv(os.path.join(dst, outFileName), index=False)
    # Update progress bar
    print_progress_bar(100, 100, prefix='PeakFilter progress:')
    # Print False Discovery Rate message
    if (parameters['calculateFDR']):
        print(message)
    # Write the final information in log file and close handler
    logger.info('PeakFilter completed. Output dataframe has %d rows.\n',
                 len(data.index))
    handler.close()
    logger.removeHandler(handler)
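
An end-to-end sketch for peak_filter(), assuming data pre-processed with XCMS; as in the previous sketches, the import paths and constructors are assumptions rather than the documented LipidFinder API.

# Hypothetical end-to-end sketch; imports and constructors are assumptions.
from LipidFinder.Configuration import LFParameters     # assumed location
from LipidFinder.LFDataFrame import LFDataFrame        # assumed location

parameters = LFParameters(module='PeakFilter')          # assumed constructor
data = LFDataFrame('xcms_output.csv', parameters)       # assumed constructor
peak_filter(data, parameters, dst='results', verbose=True)
# Expected output: 'results/peakfilter_<polarity>.csv', the summary CSV and
# the log file; with verbose=True, per-step CSVs under 'results/step_by_step/'.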