Esempio n. 1
0
def feature_per_intensity_histogram():
    """
    Example of how to make a histogram of the features per intensity.
    
    Takes a csv file which contains information on all the features. The important information are the intensities. Because every row is one feature, to get the features
    per intensity it is possible to count the occurrence of each intensity. It makes a vector out of all the intensities per unique feature in
    example_files/input/feature.csv. It retrieves vectors of information about the feature, so with little tweaking this method can be used any time that you can
    make a vector out of your data. For better viewing the data is logged with base 10.
    
    The plot is written to example_files/output/feature_per_intensity_histogram.png.
    
    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.getRowsWithUniqColumn}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rPlots.Plots.histogram}
    
      
    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')                                                    # Read a csv file into a rpy2 dataframe with features and intensities
    >>> featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')                                                    # Remove the rows with redundant id's from featDataframe
    >>> featIntensityVector = featDataframeUniq[rFunctions.index(featDataframeUniq, 'intensity')]                                    # Retrieve a vector of intensities from the unique features
    >>> featLogIntensityVector = rFunctions.takeLog(featIntensityVector,10)                                                          # Take the logarithm of all the values in the vector with base 10
    >>> plots = rPlots.Plots()                                                                                                       # Instantiate rPlots.Plots()
    >>> plots.histogram('example_files/output/feature_per_intensity_histogram.png', featLogIntensityVector, width=400, height=400,   # draw a plot with 1 histogram
    ...                                 title='#features per intensity',  ylab = '# of features', xlab = 'intensity')
    """
    # Read a csv file, separated by tabs, into a dataframe. A feature.csv file contains at least a column 'id' with the id's of all features and a column
    # 'intensity' with the intensity of each feature. A file that is separated by a different delimiter can be given using the
    # sep argument. So rFunctions.readCsvFile('example_files/input/feature.csv', sep=',') would separate on commas. Additional arguments can be found in the
    # documentation of rFunctions.readCsvFile()
    featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')
    
    # Remove all the rows from featDataframe of which the value in the column 'id' already exists. featDataframeUniq is a sub-dataframe of
    # featDataframe with only rows with unique id's.
    featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')
    
    # Retrieve all the values of the column 'intensity' from featDataframeUniq as a vector. The column must be taken from featDataframeUniq
    # (not featDataframe), otherwise the rows with redundant id's removed in the previous step would still be counted in the histogram.
    featIntensityVector = featDataframeUniq[rFunctions.index(featDataframeUniq, 'intensity')]
    
    # Take the log with base 10 of all the values in featIntensityVector.
    featLogIntensityVector = rFunctions.takeLog(featIntensityVector,10)

    # Make an instance of rPlots.Plots()
    plots = rPlots.Plots()
    
    # Use plots.histogram to plot a histogram. First argument is the outfile, second argument is the vector for the histogram,
    # title is the title of the graph, xlab is the description that will go under the x-axis, ylab is the description that will go to the y-axis.
    # The keyworded arguments are given to the **kwargs of histogram and are optional.
    # For more possible arguments see the rPlots.Plots.histogram documentation
    plots.histogram('example_files/output/feature_per_intensity_histogram.png', featLogIntensityVector, width=400, height=400, 
                                        title='#features per intensity',  ylab = '# of features', xlab = 'intensity')
Esempio n. 2
0
def msms_per_feature_per_intensity_boxplot():
    r"""
    Example of how to make a boxplot of the # of MS/MS per feature per intensity.
    
    Reads example_files/input/feature.csv and example_files/input/feature_precursor.csv, merges them, replaces the 'intensity' column of the
    merged dataframe by its rounded log10 and writes a boxplot to example_files/output/msms_per_feature_per_intensity_boxplot.png.
    
    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.getRowsWithUniqColumn}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rFunctions.getColumns}
      - L{rPlots.Plots.boxplotFormulae}
    
        
    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> import rpy2.robjects as R
    >>> featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')                                                            # Read a csv file into a rpy2 dataframe with a column containing intensities
    >>> featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')                                                            # Remove the rows with redundant id's from featDataframe
    >>> precursorPerFeatureDataframe = rFunctions.readCsvFile('example_files/input/feature_precursor.csv', head=True, sep='\t')              # Read a csv file into a rpy2 dataframe with a column containing # MS/MS per feature
    >>> mergedFeatureDataframe = R.r['merge'](featDataframeUniq, precursorPerFeatureDataframe)                                               # Merge the two dataframes, so that each feature has an intensity and an #MS/MS per feature
    >>> intensityIndex = rFunctions.index(mergedFeatureDataframe, 'intensity')                                                               # Position of the 'intensity' column in the merged dataframe
    >>> mergedFeatureDataframe[intensityIndex] = R.r['round'](rFunctions.takeLog(mergedFeatureDataframe[intensityIndex], 10))                # Take the log10 and round all the values in the 'intensity' column
    >>> vector1 = mergedFeatureDataframe[rFunctions.index(mergedFeatureDataframe, 'X..precursors')]                                          # Retrieve a vector of all values in the column '# precursors'
    >>> vector2 = mergedFeatureDataframe[intensityIndex]                                                                                     # retrieve a vector of all values in the column 'intensity'
    >>> plots = rPlots.Plots()                                                                                                               # instantiate rPlots.Plots()
    >>> plots.boxplotFormulae('example_files/output/msms_per_feature_per_intensity_boxplot.png', vector1, vector2, mergedFeatureDataframe,   # plot the boxplot
    ...             title = 'MS/MS per feature per intensity', ylab = '# of MS/MS per feature', xlab = 'Rounded log10 of intensity')
    """

    # Read a csv file, separated by tabs, into a dataframe. A file that is separated by a different delimiter can be given using the
    # sep argument. So rFunctions.readCsvFile('example_files/input/feature.csv', sep=',') would separate on commas. Additional arguments can be found in the
    # documentation of rFunctions.readCsvFile()
    featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')
    
    # Remove all the rows from featDataframe of which the value in the column 'id' already exists. featDataframeUniq is a sub-dataframe of
    # featDataframe with only rows with unique id's.
    featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')
    
    # Read the tab separated file with the # of MS/MS precursors per feature into a dataframe. Additional arguments can
    # be found in the documentation of rFunctions.readCsvFile().
    precursorPerFeatureDataframe = rFunctions.readCsvFile('example_files/input/feature_precursor.csv', head=True, sep='\t')

    # Merge precursorPerFeatureDataframe and featDataframeUniq. Because both dataframes have a column named 'id' R's merge function will automatically append
    # the '# precursors' column values from precursorPerFeatureDataframe to the right row. That is, where the value in column 'id' is the same for both dataframes.
    mergedFeatureDataframe = R.r['merge'](featDataframeUniq, precursorPerFeatureDataframe)

    # Take the log10 of all the values in the 'intensity' column and round them to their nearest full number (because discrete values are needed for the boxplot).
    # The values are taken from mergedFeatureDataframe itself and not from featDataframeUniq: R's merge by default only keeps rows present in both
    # dataframes and sorts on the shared column, so the rows of featDataframeUniq do not have to line up with the rows of the merged dataframe.
    intensityIndex = rFunctions.index(mergedFeatureDataframe, 'intensity')
    mergedFeatureDataframe[intensityIndex] = R.r['round'](rFunctions.takeLog(mergedFeatureDataframe[intensityIndex], 10))
    
    # Retrieve a vector of all the values in the column '# precursors' of mergedFeatureDataframe (read in by R as 'X..precursors')
    vector1 = mergedFeatureDataframe[rFunctions.index(mergedFeatureDataframe, 'X..precursors')]
    
    # Retrieve a vector of all the values in the (now rounded log10) column 'intensity' of mergedFeatureDataframe
    vector2 = mergedFeatureDataframe[intensityIndex]

    # Make an instance of rPlots.Plots()
    plots = rPlots.Plots()
   
    # Use plots.boxplotFormulae to plot a boxplot. Because the boxplot is the values in the column '# precursors' per value in the column 'intensity', plots.boxplotFormulae
    # is used instead of plots.boxplotDataframe. First argument is the name of the output file. Second and third argument are 2 vectors which correspond to the x and y
    # explained in rpy2's robjects_formulae documentation.  http://rpy.sourceforge.net/rpy2/doc-2.2/html/robjects_formulae.html.
    # Title is the title of the graph, xlab is the description that will go under the x-axis, ylab is the description that will go to the y-axis.
    # The keyworded arguments are given to the **kwargs of boxplotFormulae and are optional. For more possible arguments
    # see the rPlots.Plots.boxplotFormulae documentation.
    plots.boxplotFormulae('example_files/output/msms_per_feature_per_intensity_boxplot.png', vector1, vector2, mergedFeatureDataframe, 
                    title = 'MS/MS per feature per intensity', ylab = '# of MS/MS per feature', xlab = 'Rounded log10 of intensity')
Esempio n. 3
0
def feature_and_MSMS_per_intensity_histogram():
    """
    Example of how to make an overlapping histogram of the features and MS/MS precursors per intensity.
    
    Takes a feature.csv file and a mzML.csv file. It makes a vector out of all the intensities per unique feature in example_files/input/feature.csv
    and a vector of all the base peak intensities of the spectra with ms level >= 2 in example_files/input/mzML.csv. It retrieves vectors of information
    about the feature and the MS/MS, so with little tweaking this method can be used any time that you can make an n amount of vectors out
    of your data. For better viewing the data is logged with base 10.
    
    The plot is written to example_files/output/feature_and_msms_per_intensity_histogram.png.
    
    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.getRowsWithUniqColumn}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rPlots.Plots.histogram}
    
     
    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> import rpy2.robjects as R
    >>> featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')                                         # Read a csv file into a rpy2 dataframe with features and intensities
    >>> featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')                                         # Remove the rows with redundant id's from featDataframe
    >>> featIntensityVector = featDataframeUniq[rFunctions.index(featDataframeUniq, 'intensity')]                         # Retrieve a vector of intensities from the unique features
    >>> featLogIntensityVector = rFunctions.takeLog(featIntensityVector,10)                                               # Take the logarithm of all the values in the vector with base 10
    >>> mzmlDataframe = rFunctions.readCsvFile('example_files/input/mzML.csv')                                            # Read a csv file into a rpy2 dataframe with precursors and intensities
    >>> precursorDataframe = mzmlDataframe.rx(mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')].ro >= 2, True)   # Retrieve a subset dataframe of mzmlDataframe where values in column 'ms level' >= 2
    >>> mzmlIntensityVector = precursorDataframe[rFunctions.index(precursorDataframe, 'base.peak.intensity')]             # Retrieve the intensity of all the MS/MS precursors in precursorDataframe
    >>> mzmlLogIntensityVector = rFunctions.takeLog(mzmlIntensityVector, 10)                                              # Take the logarithm with base 10 of all the values in mzmlIntensityVector
    >>> plots = rPlots.Plots()                                                                                            # Instantiate rPlots.Plots()
    >>> plots.histogram('example_files/output/feature_and_msms_per_intensity_histogram.png', featLogIntensityVector,      # draw a plot with 2 histograms and a legend
    ...                 mzmlLogIntensityVector, title='feature and MSMS per intensity', xlab='intensity',
    ...                 ylab='frequency of MS/MS and Intensity',
    ...                 legend= {'x':'topright','legend':R.StrVector(['features', 'MS/MS precursors']),
    ...                          'lty':R.IntVector([1,1]), 'lwd':R.FloatVector([2.5,2.5])})
    """

    # Read a csv file, separated by tabs, into a dataframe. A feature.csv file contains at least a column 'id' with the id's of all features and a column
    # 'intensity' with the intensity of each feature. A file that is separated by a different delimiter can be given using the
    # sep argument. So rFunctions.readCsvFile('example_files/input/feature.csv', sep=',') would separate on commas. Additional arguments can be found in the
    # documentation of rFunctions.readCsvFile()
    featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')

    # Remove all the rows from featDataframe of which the value in the column 'id' already exists. featDataframeUniq is a sub-dataframe of
    # featDataframe with only rows with unique id's.
    featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')

    # Retrieve all the values of the column 'intensity' from featDataframeUniq as a vector. The column must be taken from featDataframeUniq
    # (not featDataframe), otherwise the rows with redundant id's removed in the previous step would still be counted in the histogram.
    featIntensityVector = featDataframeUniq[rFunctions.index(featDataframeUniq, 'intensity')]
    
    # Take the log with base 10 of all the values in featIntensityVector.
    featLogIntensityVector = rFunctions.takeLog(featIntensityVector,10)
    
    # Read a csv file, separated by tabs, into a dataframe. The .csv file contains at least a column 'ms level' with ms levels, a column 'id' with the id's of
    # the spectra and a column 'base peak intensity' with the base peak intensities. A file that is separated by a different delimiter can be given using the
    # sep argument. So rFunctions.readCsvFile('example_files/input/mzML.csv', sep=',') would separate on commas. Additional arguments can
    # be found in the documentation of rFunctions.readCsvFile().
    mzmlDataframe = rFunctions.readCsvFile('example_files/input/mzML.csv') 
    
    # Get a subset of mzmlDataframe with only rows that have a value of 2 or higher in the 'ms level' column (the rows that have an MS/MS precursor).
    # mzmlDataframe.rx and the .ro at the end are rpy2 functions. It is possible to use these because mzmlDataframe is a rpy2.robjects.DataFrame object.
    # The rFunctions.index() is used because, although R allows getting a column by name, rpy2 only allows getting columns by number.
    # rFunctions.index(mzmlDataframe,'ms.level') returns the number of the 'ms level' column (spaces in the column name are read in as '.').
    # The same would have been: mzmlDataframe.rx(mzmlDataframe[0].ro >= 2, True), if 'ms level' is the first column of mzmlDataframe.
    precursorDataframe = mzmlDataframe.rx(mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')].ro >= 2, True)
    
    # Retrieve all the values of the column 'base.peak.intensity' from precursorDataframe as a vector. The vector mzmlIntensityVector
    # contains the intensities of all the MS/MS precursors (because in the previous step only 'ms level' >= 2 was retrieved)
    mzmlIntensityVector = precursorDataframe[rFunctions.index(precursorDataframe, 'base.peak.intensity')]
    
    # Take the log with base 10 of all the values in mzmlIntensityVector
    mzmlLogIntensityVector = rFunctions.takeLog(mzmlIntensityVector, 10)

    # Make an instance of rPlots.Plots()
    plots = rPlots.Plots()

    # Use plots.histogram to plot 2 histograms in one figure. First argument is the outfile, second argument is the vector for one of the histograms,
    # third argument is the vector for the second histogram, title is the title of the graph, xlab is the description that will go under the x-axis,
    # ylab is the description that will go to the y-axis and legend are the arguments given to make the legend. The keyworded arguments are given to
    # the **kwargs of histogram and are optional. For more possible arguments see the rPlots.Plots.histogram
    # documentation and R's ?legend documentation for more arguments to give to legend.
    # The line widths (lwd) are 2.5, which is not an integer, so they are passed as a FloatVector instead of an IntVector.
    plots.histogram('example_files/output/feature_and_msms_per_intensity_histogram.png', featLogIntensityVector, mzmlLogIntensityVector, title='feature and MSMS per intensity', 
                    xlab='intensity', ylab='frequency of MS/MS and Intensity', 
                    legend= {'x':'topright','legend':R.StrVector(['features', 'MS/MS precursors']), 'lty':R.IntVector([1,1]), 'lwd':R.FloatVector([2.5,2.5])})