Ejemplo n.º 1
0
def post_process_normal(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Normal processing of spectra for Spec2Vec.

    Works on a clone of the input: intensities are normalized, peaks are
    limited to the 0-1000 m/z window, the minimum-peak requirement is
    applied, the peak count is reduced, very low peaks are dropped when
    enough peaks remain, and losses between 5.0 and 200.0 are added.

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)

    Returns
    -------
    The processed spectrum, or None if the input is None, if NaN
    intensities are present after normalization, or if the spectrum is None
    after the peak-count filters.
    """
    if spectrum_in is None:
        return None

    s = spectrum_in.clone()
    s = normalize_intensities(s)
    if any(np.isnan(s.peaks[1])):
        return None  # remove spectra that have all intensities 0
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None
    # Remove low peaks unless fewer than min_peaks peaks would be left.
    # The threshold follows min_peaks (instead of a fixed 10) so that the
    # result never drops below the minimum the caller asked for.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= min_peaks:
        s = s_remove_low_peaks
    # add losses to normally processed spectra
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s
Ejemplo n.º 2
0
 def apply_my_filters(s):
     """Run the spectrum through the filter chain, one step after another.

     The steps are: default filters, parent mass, intensity normalization,
     relative-intensity selection (0.0-1.0), m/z selection (0-1000) and the
     requirement of at least 5 peaks. The output of the last step is returned.
     """
     steps = (
         lambda spec: default_filters(spec),
         lambda spec: add_parent_mass(spec),
         lambda spec: normalize_intensities(spec),
         lambda spec: select_by_relative_intensity(spec, intensity_from=0.0, intensity_to=1.0),
         lambda spec: select_by_mz(spec, mz_from=0, mz_to=1000),
         lambda spec: require_minimum_number_of_peaks(spec, n_required=5),
     )
     for step in steps:
         s = step(s)
     return s
Ejemplo n.º 3
0
def test_select_by_mz(peaks, mz_from, mz_to, expected):
    """Check select_by_mz against the expected m/z and intensity arrays.

    ``peaks`` and ``expected`` are indexed as (mz, intensities) pairs.
    """
    builder = SpectrumBuilder().with_mz(peaks[0])
    spectrum_in = builder.with_intensities(peaks[1]).build()

    result = select_by_mz(spectrum_in, mz_from=mz_from, mz_to=mz_to)

    # Sizes first, so a length mismatch is reported before content is compared.
    assert result.peaks.mz.size == len(expected[0])
    assert result.peaks.mz.size == result.peaks.intensities.size
    # Then the actual values.
    assert numpy.array_equal(result.peaks.mz, expected[0])
    assert numpy.array_equal(result.peaks.intensities, expected[1])
Ejemplo n.º 4
0
 def apply_my_filters(s):
     """Example of a user-defined pre- and post-processing pipeline.

     Each step receives the output of the previous one; the result of the
     final step (requiring at least 5 peaks) is returned.
     """
     steps = (
         lambda spec: default_filters(spec),
         lambda spec: add_parent_mass(spec),
         lambda spec: normalize_intensities(spec),
         lambda spec: reduce_to_number_of_peaks(spec, n_required=10, ratio_desired=0.5),
         lambda spec: select_by_mz(spec, mz_from=0, mz_to=1000),
         lambda spec: add_losses(spec, loss_mz_from=10.0, loss_mz_to=200.0),
         lambda spec: require_minimum_number_of_peaks(spec, n_required=5),
     )
     for step in steps:
         s = step(s)
     return s
Ejemplo n.º 5
0
def spectrum_processing(s):
    """Example of a typical pre- and post-processing pipeline.

    Applies, in order: default filters, precursor m/z, intensity
    normalization, peak-count reduction, m/z selection (0-1000), losses
    (10.0-200.0) and the requirement of at least 5 peaks.
    """
    cleaned = default_filters(s)
    with_precursor = add_precursor_mz(cleaned)
    normalized = normalize_intensities(with_precursor)
    reduced = reduce_to_number_of_peaks(
        normalized, n_required=5, ratio_desired=0.5, n_max=500)
    in_mz_window = select_by_mz(reduced, mz_from=0, mz_to=1000)
    with_losses = add_losses(in_mz_window, loss_mz_from=10.0, loss_mz_to=200.0)
    return require_minimum_number_of_peaks(with_losses, n_required=5)
Ejemplo n.º 6
0
def test_select_by_mz_with_from_and_to_parameters():
    """With mz_from=15.0 and mz_to=35.0 only the peaks at 20 and 30 remain."""
    spectrum_in = Spectrum(
        mz=numpy.array([10, 20, 30, 40], dtype="float"),
        intensities=numpy.array([1, 10, 100, 1000], dtype="float"),
    )
    expected_mz = numpy.array([20, 30], dtype="float")
    expected_intensities = numpy.array([10, 100], dtype="float")

    result = select_by_mz(spectrum_in, mz_from=15.0, mz_to=35.0)

    assert result.peaks.mz.size == 2
    assert result.peaks.mz.size == result.peaks.intensities.size
    assert numpy.array_equal(result.peaks.mz, expected_mz)
    assert numpy.array_equal(result.peaks.intensities, expected_intensities)
Ejemplo n.º 7
0
def test_select_by_mz_no_parameters_1():
    """Called without bounds, select_by_mz keeps all four peaks (10-40)."""
    original_mz = numpy.array([10, 20, 30, 40], dtype="float")
    original_intensities = numpy.array([1, 10, 100, 1000], dtype="float")
    spectrum_in = Spectrum(mz=original_mz,
                           intensities=original_intensities,
                           metadata={})

    result = select_by_mz(spectrum_in)

    assert result.peaks.mz.size == 4
    assert result.peaks.mz.size == result.peaks.intensities.size
    # Compare against freshly built arrays rather than the inputs themselves.
    assert numpy.array_equal(
        result.peaks.mz, numpy.array([10, 20, 30, 40], dtype="float"))
    assert numpy.array_equal(
        result.peaks.intensities, numpy.array([1, 10, 100, 1000], dtype="float"))
Ejemplo n.º 8
0
def test_select_by_mz_no_parameters_2():
    """Called without bounds, select_by_mz drops the peaks at 1001 and 1002."""
    spectrum_in = Spectrum(
        mz=numpy.array([998, 999, 1000, 1001, 1002], dtype="float"),
        intensities=numpy.array([1, 10, 100, 1000, 10000], dtype="float"),
    )
    expected_mz = numpy.array([998, 999, 1000], dtype="float")
    expected_intensities = numpy.array([1, 10, 100], dtype="float")

    result = select_by_mz(spectrum_in)

    assert result.peaks.mz.size == 3
    assert result.peaks.mz.size == result.peaks.intensities.size
    assert numpy.array_equal(result.peaks.mz, expected_mz)
    assert numpy.array_equal(result.peaks.intensities, expected_intensities)
Ejemplo n.º 9
0
def post_process(s):
    """Normalize and filter a spectrum, then add losses.

    Steps: intensity normalization, m/z selection (0-1000), requirement of
    at least 10 peaks, best-effort peak-count reduction, removal of peaks
    below 0.001 relative intensity (only if at least 10 peaks remain), and
    addition of losses between 5.0 and 200.0.

    Returns the processed spectrum, or None if the spectrum is None after
    the peak-count filters.
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)
    try:
        s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
    except Exception:
        # Deliberate best effort: when the reduction fails, carry on with the
        # unreduced spectrum. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    if s is None:
        return None
    # Remove low peaks unless fewer than 10 peaks would be left.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks

    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s
Ejemplo n.º 10
0
def post_process_md(spectrum_in: SpectrumType,
                    low_int_cutoff: float = 0.05,
                    min_peaks: int = 10,
                    max_peaks: int = 30) -> Union[SpectrumType, None]:
    """Processing of spectra that are used for mass difference extraction.

    Works on a clone of the input. After removing the precursor peak and
    normalizing, peaks are limited to the 0-1000 m/z window, the
    minimum-peak requirement is applied, low-intensity peaks are removed in
    two passes (each pass is only kept if at least ``min_peaks`` peaks
    remain) and the spectrum is finally reduced to at most ``max_peaks``
    peaks.

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    low_int_cutoff:
        Lower intensity cutoff for the peaks selected for MD
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)
    max_peaks:
        Maximum number of peaks allowed in the spectrum (ranked on intensity)

    Returns
    -------
    The processed spectrum, or None if the input is None, if NaN
    intensities are present after normalization, or if the spectrum is None
    after the peak-count filters.
    """
    if spectrum_in is None:
        return None

    s = spectrum_in.clone()
    # remove precursor_mz from spectra so neutral losses don't end up in MDs
    s = remove_precursor_mz_peak(s)
    s = normalize_intensities(s)
    if any(np.isnan(s.peaks[1])):
        return None  # remove spectra that have all intensities 0
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None
    # Remove low peaks unless fewer than min_peaks peaks would be left.
    # The guard follows min_peaks (instead of a fixed 10) so it agrees with
    # the n_required=min_peaks used by the final reduction below.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= min_peaks:
        s = s_remove_low_peaks
    # do an additional removal step with a different intensity cutoff
    s_second_peak_removal = select_by_relative_intensity(
        s, intensity_from=low_int_cutoff)
    if len(s_second_peak_removal.peaks) >= min_peaks:
        s = s_second_peak_removal

    # reduce to the top max_peaks peaks
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, n_max=max_peaks)
    return s
Ejemplo n.º 11
0
def post_process_classical(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Prepare a spectrum for the calculation of classical scores.

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)

    Returns
    -------
    The output of the final relative-intensity selection (0.01-1.0), or None
    if the input is None or NaN intensities are present after normalization.
    """
    if spectrum_in is None:
        return None

    normalized = normalize_intensities(spectrum_in.clone())
    # NaN intensities show up for spectra whose intensities are all 0.
    contains_nan = any(np.isnan(normalized.peaks[1]))
    if contains_nan:
        return None

    in_mz_window = select_by_mz(normalized, mz_from=0, mz_to=1000)
    with_enough_peaks = require_minimum_number_of_peaks(
        in_mz_window, n_required=min_peaks)
    return select_by_relative_intensity(
        with_enough_peaks, intensity_from=0.01, intensity_to=1.0)
Ejemplo n.º 12
0
def spectrum_processing_minimal(
        spectrum: SpectrumType,
        **settings: Union[int, float]) -> Union[SpectrumType, None]:
    """Minimal spectrum processing required by MS2Query.

    Normalizes intensities, removes peaks below the intensity and m/z
    thresholds, and checks the peak-count and precursor m/z requirements.
    Missing settings are filled in by ``set_minimal_processing_defaults``.

    Args:
    ----------
    spectrum:
        Spectrum to process
    mz_from
        Set lower threshold for m/z peak positions. Default is 10.0.
    n_required_below_mz
        Number of minimal required peaks with m/z below 1000.0Da for a spectrum
        to be considered.
        Spectra not meeting this criteria will be set to None.
    intensity_from
        Set lower threshold for peak intensity. Default is 0.001.
    max_mz_required
        Only peaks <= max_mz_required will be counted to check if spectrum
        contains sufficient peaks to be considered.
    """
    config = set_minimal_processing_defaults(**settings)

    normalized = normalize_intensities(spectrum)
    above_intensity = select_by_intensity(
        normalized, intensity_from=config["intensity_from"])
    # Only a lower m/z bound is applied here; the upper bound is infinity.
    above_mz = select_by_mz(
        above_intensity, mz_from=config["mz_from"], mz_to=np.inf)
    with_enough_peaks = require_peaks_below_mz(
        above_mz,
        n_required=config["n_required_below_mz"],
        max_mz=config["max_mz_required"])
    return require_precursor_mz(with_enough_peaks)
Ejemplo n.º 13
0
def spectrum_processing_s2v(
        spectrum: SpectrumType,
        **settings: Union[int, float]) -> SpectrumType:
    """Spectrum processing required for computing Spec2Vec scores.

    Selects peaks by m/z, reduces the number of peaks and adds losses.
    Missing settings are filled in by ``set_spec2vec_defaults``.

    Args:
    ----------
    spectrum:
        Spectrum to process
    mz_from:
        Peaks below this value are removed. Default = 10.0
    mz_to:
        Peaks above this value are removed. Default = 1000.0
    n_required
        Number of minimal required peaks for a spectrum to be considered.
    n_max
        Maximum number of peaks to be kept per spectrum. Default is 1000.
    loss_mz_from
        Minimum allowed m/z value for losses. Default is 0.0.
    loss_mz_to
        Maximum allowed m/z value for losses. Default is 1000.0.

    Raises:
    ----------
    AssertionError
        If the spectrum is None after processing.
    """
    settings = set_spec2vec_defaults(**settings)
    spectrum = select_by_mz(spectrum,
                            mz_from=settings["mz_from"],
                            mz_to=settings["mz_to"])
    spectrum = reduce_to_number_of_peaks(spectrum,
                                         n_required=settings["n_required"],
                                         n_max=settings["n_max"])

    spectrum = add_losses(spectrum,
                          loss_mz_from=settings["loss_mz_from"],
                          loss_mz_to=settings["loss_mz_to"])
    # Raised explicitly rather than via an assert statement, so the check is
    # still performed when Python runs with -O. The exception type and
    # message are the same as before.
    if spectrum is None:
        raise AssertionError(
            "Expects Spectrum that has high enough quality and is not None")
    return spectrum
Ejemplo n.º 14
0
 def apply_filters(s):
     """Normalize, select peaks by m/z and relative intensity, clear losses.

     ``min_mz``, ``max_mz`` and ``intensity_threshold`` are read from the
     enclosing scope. Returns None if the spectrum is None after filtering.
     """
     s = normalize_intensities(s)
     s = select_by_mz(s, mz_from=min_mz, mz_to=max_mz)
     s = select_by_relative_intensity(s, intensity_from=intensity_threshold)
     if s is None:
         # Nothing to clear; avoids an AttributeError on the assignment below.
         return None
     s.losses = None
     return s
Ejemplo n.º 15
0
def main(argv):
    """Filter a mass spectra file with the filters selected on the command line.

    Loads the spectra from ``--spectra`` (format given by
    ``--spectra_format``, either 'msp' or 'mgf'), applies every selected
    filter to each spectrum and writes the result to ``--output`` in the
    same format.

    Parameters
    ----------
    argv:
        Currently unused: ``parser.parse_args()`` is called without
        arguments and therefore reads ``sys.argv``.
        NOTE(review): confirm against the caller whether ``argv`` should be
        passed to ``parse_args``.

    Returns
    -------
    0 on success.

    Raises
    ------
    ValueError
        If no filter flag is given or the spectra format is not supported.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("--spectra",
                        type=str,
                        required=True,
                        help="Mass spectra file to be filtered.")
    parser.add_argument("--spectra_format",
                        type=str,
                        required=True,
                        help="Format of spectra file.")
    parser.add_argument("--output",
                        type=str,
                        required=True,
                        help="Filtered mass spectra file.")
    parser.add_argument(
        "-normalise_intensities",
        action='store_true',
        help="Normalize intensities of peaks (and losses) to unit height.")
    parser.add_argument(
        "-default_filters",
        action='store_true',
        help=
        "Collection of filters that are considered default and that do no require any (factory) arguments."
    )
    parser.add_argument(
        "-clean_metadata",
        action='store_true',
        help=
        "Apply all adding and cleaning filters if possible, so that the spectra have canonical metadata."
    )
    parser.add_argument(
        "-relative_intensity",
        action='store_true',
        help=
        "Keep only peaks within set relative intensity range (keep if to_intensity >= intensity >= from_intensity)."
    )
    parser.add_argument("--from_intensity",
                        type=float,
                        help="Lower bound for intensity filter")
    parser.add_argument("--to_intensity",
                        type=float,
                        help="Upper bound for intensity filter")
    parser.add_argument(
        "-mz_range",
        action='store_true',
        help=
        "Keep only peaks between set m/z range (keep if to_mz >= m/z >= from_mz)."
    )
    parser.add_argument("--from_mz",
                        type=float,
                        help="Lower bound for m/z  filter")
    parser.add_argument("--to_mz",
                        type=float,
                        help="Upper bound for m/z  filter")
    args = parser.parse_args()

    # At least one filter flag has to be present, otherwise there is no work.
    if not (args.normalise_intensities or args.default_filters or
            args.clean_metadata or args.relative_intensity or args.mz_range):
        raise ValueError('No filter selected.')

    if args.spectra_format == 'msp':
        spectra = list(load_from_msp(args.spectra))
    elif args.spectra_format == 'mgf':
        # Was args.queries_format, which the parser never defines, so every
        # non-msp format ended in an AttributeError.
        spectra = list(load_from_mgf(args.spectra))
    else:
        raise ValueError(
            f'File format {args.spectra_format} not supported for mass spectra file.'
        )

    filtered_spectra = []
    for spectrum in spectra:
        if args.normalise_intensities:
            spectrum = normalize_intensities(spectrum)

        if args.default_filters:
            spectrum = default_filters(spectrum)

        if args.clean_metadata:
            filters = [
                add_compound_name, add_precursor_mz, add_fingerprint,
                add_losses, add_parent_mass, add_retention_index,
                add_retention_time, clean_compound_name
            ]
            for metadata_filter in filters:
                spectrum = metadata_filter(spectrum)

        if args.relative_intensity:
            spectrum = select_by_relative_intensity(spectrum,
                                                    args.from_intensity,
                                                    args.to_intensity)

        if args.mz_range:
            spectrum = select_by_mz(spectrum, args.from_mz, args.to_mz)

        filtered_spectra.append(spectrum)

    # The format was validated above, so anything that is not 'msp' is 'mgf'.
    if args.spectra_format == 'msp':
        save_as_msp(filtered_spectra, args.output)
    else:
        save_as_mgf(filtered_spectra, args.output)

    return 0