def _update_status(data, stepDst, verbose, stepNum):
    # type: (LFDataFrame, str, bool, int) -> int
    """Create CSV file from 'data' in 'stepDst', update progress bar
    and return incremented step number.

    Keyword Arguments:
        data    -- LFDataFrame instance
        stepDst -- destination directory for CSV 'data' file
        verbose -- create CSV 'data' file?
        stepNum -- step number completed
    """
    # Update progress bar
    print_progress_bar(INCREMENT * stepNum, 100,
                       prefix='PeakFilter progress:')
    if (verbose):
        # Create a CSV file with the whole processed dataframe
        outFileName = 'peakfilter_step_{:02d}.csv'.format(stepNum)
        data.to_csv(os.path.join(stepDst, outFileName), index=False)
    stepNum += 1
    return stepNum
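# Example call for the helper above (a minimal sketch, kept as a comment so
# it does not run on import; it assumes 'data' is an LFDataFrame and that
# the 'step_by_step' directory already exists):
#
#     stepNum = 1
#     stepNum = _update_status(data, 'step_by_step', True, stepNum)
#     # The progress bar advances to INCREMENT * 1, the file
#     # 'step_by_step/peakfilter_step_01.csv' is written, and stepNum is 2.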
def amalgamate_data(negData, posData, parameters, dst=''):
    # type: (object, object, LFParameters, str) -> None
    """Amalgamate negative and positive ion polarity dataframes.

    'negData' and 'posData' must have the same column layout as the
    output files from LipidFinder's PeakFilter module. For those frames
    with matching m/z and retention time, the one with the lowest total
    intensity mean is discarded. Both files must have the same column
    headings.

    If 'dst' is not an absolute path, the current working directory
    will be used as starting point. If "amalgamated.csv" file already
    exists, it will be overwritten.

    Keyword Arguments:
        negData    -- negative polarity LFDataFrame or pandas.DataFrame
                      instance
        posData    -- positive polarity LFDataFrame or pandas.DataFrame
                      instance
        parameters -- LipidFinder's Amalgamator parameters instance
        dst        -- destination directory where the log file and the
                      amalgamated data CSV file will be saved
                      [default: current working directory]
    """
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'amalgamator.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Write initial information in log file
    logger.info(("Starting Amalgamator. Negative dataframe has %d rows and "
                 "Positive dataframe has %d rows."), len(negData.index),
                len(posData.index))
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    # Check if columns in both dataframes are the same
    if (set(negData.columns) != set(posData.columns)):
        diffCols = set(negData.columns).symmetric_difference(posData.columns)
        raise IOError(("Input dataframes do not share the same column names: "
                       "{0}").format(', '.join(diffCols)))
    # Check for misspelling errors in m/z or retention time column names
    if ((mzCol not in negData.columns) or (rtCol not in negData.columns)):
        raise KeyError("Missing '{0}' or '{1}' column(s)".format(mzCol, rtCol))
    # Get the indices for intensity columns
    firstIndex = parameters['firstSampleIndex'] - 1
    lastIndex = firstIndex + parameters['numSamples']
    # Calculate the mean of every non-zero value of the mean columns of
    # each input dataframe and round it to the nearest integer. Replace
    # any NaN output from mean() by zero.
    totalMean = lambda x: numpy.rint(
            numpy.nan_to_num(x[numpy.where(x > 0)[0]].mean())).astype(int)
    negData['TotalMean'] = negData.iloc[:, firstIndex:lastIndex].apply(
            totalMean, axis=1)
    posData['TotalMean'] = posData.iloc[:, firstIndex:lastIndex].apply(
            totalMean, axis=1)
    nind = negData.index.values
    nmz = negData[mzCol].values
    nrt = negData[rtCol].values
    nmeans = negData['TotalMean'].values
    negCol = list(negData.columns.values)
    posCol = list(posData.columns.values)
    # Empty results dataframe
    results = pandas.DataFrame(columns=negCol)
    polColIndex = results.columns.get_loc('Polarity')
    # Start progress bar
    progress = 0
    total = len(nind) + 1
    print_progress_bar(progress, total, prefix='Amalgamator progress:')
    # Loop through indices in negative file
    for i in nind:
        # Update progress bar
        progress += 1
        print_progress_bar(progress, total, prefix='Amalgamator progress:')
        negMass = nmz[i]
        negRT = nrt[i]
        pmz = posData[mzCol].values
        prt = posData[rtCol].values
        pmeans = posData['TotalMean'].values
        negMassH2 = negMass + HYDROGEN
        mzRange = mz_tol_range(negMassH2, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        rtRange = rt_tol_range(negRT, parameters['maxRTDiffAdjFrame'])
        matchesH2 = list(
                numpy.where((pmz >= mzRange[0]) & (pmz <= mzRange[1])
                            & (prt >= rtRange[0]) & (prt <= rtRange[1]))[0])
        # First, look for H2 matches
        if (matchesH2):
            indMatch = __bestMatch__(matchesH2, negMassH2, pmz, negRT, prt,
                                     parameters)
            # Keep the frame with the highest total mean
            if (pmeans[indMatch] > nmeans[i]):
                results = results.append(posData.iloc[indMatch],
                                         ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + negData.iloc[i, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            else:
                results = results.append(negData.iloc[i], ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + posData.iloc[indMatch, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            logger.info('Match found: Negative ID %d - Positive ID %d.',
                        negData.iloc[i, 0], posData.iloc[indMatch, 0])
            # Remove match from positive dataframe, avoiding writing
            # the action to the log file
            if (isinstance(posData, LFDataFrame)):
                super(LFDataFrame, posData).drop(indMatch, inplace=True)
            else:
                posData.drop(indMatch, inplace=True)
            posData.reset_index(inplace=True, drop=True)
            pmz = posData[mzCol].values
            prt = posData[rtCol].values
            pmeans = posData['TotalMean'].values
            continue
        # If there are no H2 matches, look for CH4 matches
        negMassCH4 = negMass + METHANE
        mzRange = mz_tol_range(negMassCH4, parameters['mzFixedError'],
                               parameters['mzPPMError'])
        matchesHCH3 = list(
                numpy.where((pmz >= mzRange[0]) & (pmz <= mzRange[1])
                            & (prt >= rtRange[0]) & (prt <= rtRange[1]))[0])
        if (matchesHCH3):
            indMatch = __bestMatch__(matchesHCH3, negMassCH4, pmz, negRT, prt,
                                     parameters)
            # Keep the frame with the highest total mean
            if (pmeans[indMatch] > nmeans[i]):
                results = results.append(posData.iloc[indMatch],
                                         ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + negData.iloc[i, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            else:
                results = results.append(negData.iloc[i], ignore_index=True)
                if (parameters['combineIntensities']):
                    results.iloc[-1, firstIndex : lastIndex] = \
                            results.iloc[-1, firstIndex : lastIndex] \
                            + posData.iloc[indMatch, firstIndex : lastIndex]
                    results.iloc[-1, polColIndex] += ' (Combined)'
                else:
                    results.iloc[-1, polColIndex] += ' (Both)'
            logger.info('Match found: Negative ID %d - Positive ID %d.',
                        negData.iloc[i, 0], posData.iloc[indMatch, 0])
            # Remove match from positive dataframe, avoiding writing
            # the action to the log file
            if (isinstance(posData, LFDataFrame)):
                super(LFDataFrame, posData).drop(indMatch, inplace=True)
            else:
                posData.drop(indMatch, inplace=True)
            posData.reset_index(inplace=True, drop=True)
            pmz = posData[mzCol].values
            prt = posData[rtCol].values
            pmeans = posData['TotalMean'].values
            continue
        results = results.append(negData.iloc[i], ignore_index=True)
    # Append what remains in the positive dataframe (unmatched positive
    # m/z values)
    results = results.append(posData, ignore_index=True)
    if (pandas.__version__ < '0.23.0'):
        # Fix unexpected column sorting from append() in pandas v0.20.3
        # or newer (solved in v0.23.0 with argument "sort=False")
        results = results.reindex(negCol, axis=1)
    results.drop('TotalMean', axis=1, inplace=True)
    # Sort results by m/z and retention time and create the CSV file
    results.sort_values([mzCol, rtCol], inplace=True, kind='mergesort')
    results.to_csv(os.path.join(dst, 'amalgamated.csv'), index=False)
    # Update progress bar
    print_progress_bar(total, total, prefix='Amalgamator progress:')
    # Write the final information in log file and remove handler
    logger.info('Amalgamator completed. Output dataframe has %d rows.\n',
                len(results.index))
    handler.close()
    logger.removeHandler(handler)
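# Example usage (a minimal sketch, kept as a comment so it does not run on
# import; the file names are hypothetical, and the import path and
# LFParameters constructor arguments assume LipidFinder's usual layout):
#
#     import pandas
#     from LipidFinder.Configuration import LFParameters
#
#     parameters = LFParameters(module='amalgamator', src='amalgamator.json')
#     negData = pandas.read_csv('peakfilter_negative.csv')
#     posData = pandas.read_csv('peakfilter_positive.csv')
#     amalgamate_data(negData, posData, parameters, dst='output')
#     # Writes 'output/amalgamated.csv' and 'output/amalgamator.log'.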
def bulk_structure_search(data, parameters, dst=''):
    # type: (object, LFParameters, str) -> None
    """Search in LIPID MAPS for matches of the m/z values in the input
    dataframe.

    'data' must have, at least, m/z, retention time (RT) and "Polarity"
    columns. The adducts included in the search as well as the specific
    in-house lipidomics database, the mass tolerance and the lipid
    categories are provided in 'parameters'. The resulting dataframe
    will include every bulk structure match for each m/z, including its
    RT, main class, category and other relevant information.

    If 'dst' is not an absolute path, the current working directory
    will be used as starting point. If "mssearch_<db>.xlsx" already
    exists, it will be overwritten without warning. "<db>" stands for
    the selected LIPID MAPS database.

    Keyword arguments:
        data       -- LFDataFrame or pandas.DataFrame instance
        parameters -- LipidFinder's MS Search parameters instance
        dst        -- destination directory where the log file, the
                      output XLSX file and the category scatter plot
                      figure (if selected) will be saved
                      [default: current working directory]
    """
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'mssearch.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Write initial information in log file
    logger.info('Starting MS Search on %s. Input dataframe has %d rows.',
                parameters['database'], len(data.index))
    # Start progress bar
    progress = 0
    print_progress_bar(progress, 100, prefix='MSSearch progress:')
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    # Get the list of unique m/z values from 'data'
    mzList = data[mzCol].unique().tolist()
    numMZ = len(mzList)
    logger.info('%d unique m/z values found.', numMZ)
    # Get the list of target adducts from the parameters
    targetAdducts = parameters['targetAdducts']
    if (not targetAdducts):
        # If the list is empty, use the complete list of ion adducts
        targetAdducts = parameters._parameters['targetAdducts']['options']
    # Keep only the adduct information between brackets
    targetAdducts = [x[x.find('[') + 1 : x.find(']')] for x in targetAdducts]
    targetAdducts = ','.join(targetAdducts)
    if (parameters['mzToleranceUnit'] == 'Daltons'):
        tolerance = parameters['mzTolerance']
    # Get matches in batches to balance the number of requests and the
    # amount of information requested
    matches = pandas.DataFrame()
    # Calculate progress increment for each batch
    increment = 63.0 / numpy.ceil(float(numMZ) / BATCH_SIZE)
    for start in range(0, numMZ, BATCH_SIZE):
        mzBatch = mzList[start : start + BATCH_SIZE]
        # Get a string with one m/z per line (text file alike)
        mzStr = os.linesep.join(map(str, mzBatch))
        if (parameters['mzToleranceUnit'] == 'PPM'):
            # Calculate maximum tolerance in Da from tolerance in parts
            # per million (ppm)
            tolerance = mzBatch[-1] * parameters['mzTolerance'] / 1e6
        # Create the data package with the query
        if (parameters['categories']):
            mpData = MultipartEncoder(
                    fields={'CHOICE': parameters['database'], 'sort': 'DELTA',
                            'file': ('file', StringIO(mzStr), 'text/plain'),
                            'tol': str(tolerance), 'ion': targetAdducts,
                            'even': '2' if parameters['evenChains'] else '1',
                            'category': ','.join(parameters['categories'])})
        else:
            mpData = MultipartEncoder(
                    fields={'CHOICE': parameters['database'], 'sort': 'DELTA',
                            'file': ('file', StringIO(mzStr), 'text/plain'),
                            'tol': str(tolerance), 'ion': targetAdducts,
                            'even': '2' if parameters['evenChains'] else '1'})
        # Request the table containing the matches from LIPID MAPS
        try:
            response = requests.post(
                    LIPIDMAPS_URL, data=mpData,
                    headers={'Content-Type': mpData.content_type})
        except requests.exceptions.RequestException:
            raise Exception(("Connection error with the database. Please "
                             "check your network and try again after a few "
                             "minutes."))
        # Go to next batch if this one returned nothing
        if (len(response.text) == 0):
            # Update progress bar
            progress += increment
            print_progress_bar(progress, 100, prefix='MSSearch progress:')
            continue
        # Process the response to create a dataframe
        batchMatches = pandas.read_csv(StringIO(response.text), sep='\t',
                                       engine='python', index_col=False)
        if (batchMatches.empty):
            # Update progress bar
            progress += increment
            print_progress_bar(progress, 100, prefix='MSSearch progress:')
            continue
        # Join all the information already gathered
        matches = matches.append(batchMatches, ignore_index=True)
        # Update progress bar
        progress += increment
        print_progress_bar(progress, 100, prefix='MSSearch progress:')
    if (matches.empty):
        matches = pandas.DataFrame(
                columns=[mzCol, 'Matched MZ', 'Delta', 'Bulk Structure',
                         'Formula', 'Adduct', 'Main Class', 'Category'])
    else:
        # Rename m/z column
        matches.rename(columns={'Input Mass': mzCol}, inplace=True)
        # Round 'Input Mass' values that might have been altered by
        # LIPID MAPS server
        matches[mzCol] = matches[mzCol].apply(round,
                                              ndigits=data._resolution)
    # Calculate the delta PPM of each row and add it to the dataframe
    dPPM = abs(matches[mzCol] - matches['Matched MZ']) * 1e6 / matches[mzCol]
    matches.insert(2, 'Delta_PPM', dPPM)
    if (parameters['mzToleranceUnit'] == 'PPM'):
        # Make sure all the matches comply with the m/z tolerance in ppm
        matches = matches[matches['Delta_PPM'] <= parameters['mzTolerance']]
    # Add RT and polarity values to each existing record and include
    # the rows in 'data' that did not have a match
    matches.insert(3, rtCol, 0.0)
    matches.insert(4, 'Polarity', '')
    # Calculate progress increment for each batch
    increment = 33.0 / numpy.ceil(len(data) / float(BATCH_SIZE))
    # Create result dataframe with all the columns in that dataframe
    colNames = [x for x in list(data) if x not in [mzCol, rtCol, 'Polarity']]
    extraCols = []
    for column in colNames:
        if (column not in list(matches)):
            extraCols.append(column)
        else:
            # Keep all columns from source dataset, adding prefix
            # "src_" if that column name is already in the dataframe
            extraCols.append('src_' + column)
            data.rename(columns={column: 'src_' + column}, inplace=True)
    result = pandas.DataFrame(columns=list(matches) + extraCols)
    # Ensure the polarity column contains only strings so the
    # conditional test in the next loop works as expected
    data['Polarity'].replace(numpy.nan, '', regex=True, inplace=True)
    # For those m/z values with more than one RT, the whole set of
    # matches is replicated for every RT
    for index, row in data.iterrows():
        mzMatches = matches.loc[matches[mzCol] == row[mzCol]]
        # Remove positive adduct matches for m/z found in negative
        # mode, and negative adduct matches for m/z found in positive
        # mode
        if (row['Polarity'].lower().startswith('n')):
            mzMatches = mzMatches.loc[mzMatches['Adduct'].str[-1] != '+']
        elif (row['Polarity'].lower().startswith('p')):
            mzMatches = mzMatches.loc[mzMatches['Adduct'].str[-1] != '-']
        if (mzMatches.empty):
            # Unmatched m/z from 'data'
            mzMatches = mzMatches.append(row[[mzCol, rtCol, 'Polarity']],
                                         ignore_index=True)
        else:
            # Copy RT and polarity values to each matched m/z
            mzMatches[rtCol] = row[rtCol]
            mzMatches['Polarity'] = row['Polarity']
        # Copy the extra columns (if any) to each matched m/z
        for col in extraCols:
            mzMatches[col] = row[col]
        result = result.append(mzMatches, ignore_index=True)
        if ((index + 1) % BATCH_SIZE == 0):
            # Update progress bar
            progress += increment
            print_progress_bar(progress, 100, prefix='MSSearch progress:')
    # Sort the results by m/z, delta PPM and matched m/z to ease the
    # manipulation of the output XLSX file
    result.sort_values([mzCol, 'Delta_PPM', 'Matched MZ'], inplace=True,
                       kind='mergesort')
    # Create the XLSX file with the whole putative profiling dataframe
    outPath = os.path.join(
            dst, 'mssearch_{0}.xlsx'.format(parameters['database'].lower()))
    result.to_excel(outPath, index=False, engine='xlsxwriter')
    if (parameters['summary']):
        # Create summary XLSX file from the putative profiling
        # dataframe, keeping only one row per m/z and RT with the most
        # frequent lipid category
        Summary.create_summary(result, parameters, dst)
    # Update progress bar
    print_progress_bar(98, 100, prefix='MSSearch progress:')
    # Generate the category scatter plot of the most common lipid
    # category per m/z and RT
    if (parameters['plotCategories']):
        DataPlots.category_scatterplot(result, parameters, dst)
    # Update progress bar
    print_progress_bar(100, 100, prefix='MSSearch progress:')
    # Write the final information in log file and close handler
    matches = result[result['Category'].notna()]
    logger.info('MS Search completed. %d matches found for %d m/z values.\n',
                len(matches), len(matches[mzCol].unique()))
    handler.close()
    logger.removeHandler(handler)
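# Example usage (a minimal sketch, kept as a comment so it does not run on
# import; the file names are hypothetical, and the import paths and
# constructor signatures assume LipidFinder's usual layout -- an LFDataFrame
# is used here so that 'data._resolution' is available):
#
#     from LipidFinder.Configuration import LFParameters
#     from LipidFinder.LFDataFrame import LFDataFrame
#
#     parameters = LFParameters(module='mssearch', src='mssearch.json')
#     data = LFDataFrame('amalgamated.csv', parameters)
#     bulk_structure_search(data, parameters, dst='output')
#     # Writes 'output/mssearch_<db>.xlsx' for the selected database.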
def peak_filter(data, parameters, dst='', verbose=False):
    # type: (LFDataFrame, LFParameters, str, bool) -> None
    """Filter contaminants and redundant artifacts from LC/MS data
    pre-processed by XCMS or another pre-processing tool.

    If 'dst' is not an absolute path, the current working directory
    will be used as starting point. If either
    "peakfilter_<polarity>.csv" or "peakfilter_<polarity>_summary.csv"
    files already exist, they will be overwritten. "<polarity>" stands
    for "positive" or "negative", as stated in the parameters.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
        dst        -- destination directory where the log file, the
                      processed data CSV file and the summary CSV file
                      will be saved [default: current working
                      directory]
        verbose    -- create folder inside 'dst' where the intermediate
                      results will be saved in CSV files
    """
    # Start progress bar
    print_progress_bar(0, 100, prefix='PeakFilter progress:')
    # Set the log file where the information about the steps performed
    # is saved
    logFilePath = 'peakfilter.log'
    if (dst):
        logFilePath = os.path.join(dst, logFilePath)
    # Create logger and its file handler
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(logFilePath)
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('[%(asctime)s] %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    # Write initial information in log file
    logger.info('Starting PeakFilter. Input dataframe ("%s") has %d rows.',
                data.src, len(data.index))
    # Prepare the folder structure to store the intermediate files
    stepDst = os.path.join(dst, 'step_by_step')
    if (verbose and not os.path.isdir(stepDst)):
        os.makedirs(stepDst)
    stepNum = 1
    # QC Sample Calculations
    if (parameters['numQCReps'] > 0):
        # Perform mean and RSD on QC samples
        qcRatio = QCCalcs.qc_rsd_ratio(data, parameters)
        # Write report in log file
        logger.info(("QC Sample Calculations completed. %.1f%% samples "
                     "between %d%% and %d%% QC-RSD"), qcRatio,
                    parameters['QCRSD'][0], parameters['QCRSD'][1])
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Solvent Calculations
    if ((parameters['numSolventReps'] > 0) and parameters['removeSolvents']):
        # Perform mean and RSD on solvent samples, perform the outlier
        # correction, remove frames where all technical replicates of
        # all samples are less than 'solventMinFoldDiff' times the
        # solvent mean, and subtract the solvent mean intensity from
        # the remaining intensities of sample replicates
        SolventCalcs.remove_solvent_effect(data, parameters)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Background correction: remove low intensity frames
    SolventCalcs.remove_low_intensity_frames(data, parameters)
    if (parameters['preprocSoftware'] == 'XCMS'):
        # Get m/z clusters required by 'MassReassignment' and
        # 'BroadContaminant' modules
        Clustering.cluster_by_mz(data, parameters)
        # Create the "FeatureClusterID" column that will be used by the
        # 'RTCorrection' step. In XCMS, each row is already a feature.
        data['FeatureClusterID'] = range(1, len(data) + 1)
    else:
        # Perform peak finding for any other pre-processing software
        PeakFinder.process_features(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # In-source ion fragment removal
    if (parameters['removeIonFrags']):
        InSrcFragRemoval.remove_in_src_frags(data, parameters)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Contaminant removal
    if (parameters['removeContaminants']):
        ContaminantRemoval.remove_contaminants(data, parameters)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Adduct removal
    if (parameters['removeAdducts']):
        ContaminantRemoval.remove_adducts(data, parameters)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Stack removal
    if (parameters['removeStacks']):
        ContaminantRemoval.remove_stacks(data, parameters)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Retention time correction of each set of sample replicates to fix
    # likely alignment errors from other pre-processing tools
    if ((parameters['numTechReps'] > 1)
            and (parameters['preprocSoftware'] == 'Other')):
        RTCorrection.correct_retention_time(data, parameters)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Remove outliers from sample replicates
    OutlierCorrection.remove_outliers(data, parameters, src='samples')
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Calculate and add the mean of each sample's replicates
    SampleMeansCalc.calculate_sample_means(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Retention time correction to the means of the sample replicates
    if (parameters['correctRTMeans']):
        RTCorrection.correct_retention_time(data, parameters, True)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Assign each m/z in either a mass or feature cluster to the m/z of
    # the row containing the highest sample mean intensity
    MassReassignment.reassign_frame_masses(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Remove ions with similar intensities for the same m/z that are
    # likely to be contaminants
    BroadContaminant.process_all_features(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Isotope removal
    Deisotoping.remove_isotopes(data, parameters)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Mass defect filter: remove salt clusters
    if (parameters['filterMassDefect']):
        MassDefectFilter.remove_salt_clusters(data, parameters)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Calculate the False Discovery Rate
    if (parameters['calculateFDR']):
        try:
            fdrValue = FalseDiscoveryRate.get_fdr(data, parameters)
            message = ("False Discovery Rate for selected data and "
                       "parameters: {0:.2%}").format(fdrValue)
        except ValueError as e:
            message = 'ValueError: ' + e.args[0]
        except Exception as oe:
            message = 'OtherError: ' + oe.args[0]
        logger.info(message)
        stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Create summary CSV file from the processed dataframe
    Summary.create_summary(data, parameters, dst)
    stepNum = _update_status(data, stepDst, verbose, stepNum)
    # Create a CSV file with the whole processed dataframe
    data['Polarity'] = parameters['polarity']
    outFileName = 'peakfilter_{0}.csv'.format(parameters['polarity'].lower())
    data.to_csv(os.path.join(dst, outFileName), index=False)
    # Update progress bar
    print_progress_bar(100, 100, prefix='PeakFilter progress:')
    # Print False Discovery Rate message
    if (parameters['calculateFDR']):
        print(message)
    # Write the final information in log file and close handler
    logger.info('PeakFilter completed. Output dataframe has %d rows.\n',
                len(data.index))
    handler.close()
    logger.removeHandler(handler)
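# Example usage (a minimal sketch, kept as a comment so it does not run on
# import; the file names are hypothetical, and the import paths and
# constructor signatures assume LipidFinder's usual layout):
#
#     from LipidFinder.Configuration import LFParameters
#     from LipidFinder.LFDataFrame import LFDataFrame
#
#     parameters = LFParameters(module='peakfilter', src='peakfilter.json')
#     data = LFDataFrame('xcms_output.csv', parameters)
#     peak_filter(data, parameters, dst='output', verbose=True)
#     # Writes 'output/peakfilter_<polarity>.csv', the summary CSV and, with
#     # verbose=True, per-step CSVs under 'output/step_by_step/'.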