def post_process_normal(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Normal processing of spectra for Spec2Vec.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed straight through.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None).
        The same value is the floor for the low-intensity peak removal:
        that removal is only kept when at least ``min_peaks`` peaks remain.

    Returns
    -------
    The processed clone of ``spectrum_in`` with losses added, or ``None``
    when the spectrum is rejected.
    """
    if spectrum_in is None:
        return None

    # Work on a clone of the input rather than on the input object itself.
    s = spectrum_in.clone()
    s = normalize_intensities(s)
    # Per the original author's note, NaN intensities after normalisation
    # mark spectra whose intensities are all 0; those are rejected.
    if any(np.isnan(s.peaks[1])):
        return None

    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None

    # Remove low peaks unless fewer than `min_peaks` peaks would be left.
    # (Previously compared against a hard-coded 10, which ignored a
    # caller-supplied `min_peaks`.)
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= min_peaks:
        s = s_remove_low_peaks

    # Add losses to normally processed spectra.
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s
def apply_my_filters(s):
    """Run a fixed chain of filters over ``s`` and return the result.

    The steps are applied in order: default filters, parent mass,
    intensity normalisation, relative-intensity selection (0.0 to 1.0),
    m/z selection (0 to 1000) and a minimum-peak requirement of 5.
    """
    pipeline = (
        (default_filters, {}),
        (add_parent_mass, {}),
        (normalize_intensities, {}),
        (select_by_relative_intensity,
         {"intensity_from": 0.0, "intensity_to": 1.0}),
        (select_by_mz, {"mz_from": 0, "mz_to": 1000}),
        (require_minimum_number_of_peaks, {"n_required": 5}),
    )
    spectrum = s
    for step, options in pipeline:
        # Each filter receives the output of the previous one.
        spectrum = step(spectrum, **options)
    return spectrum
def test_select_by_mz(peaks, mz_from, mz_to, expected):
    """Check ``select_by_mz`` against the expected m/z and intensity arrays.

    ``peaks`` and ``expected`` are indexed as ``[0]`` for m/z values and
    ``[1]`` for intensities.
    """
    builder = SpectrumBuilder().with_mz(peaks[0]).with_intensities(peaks[1])
    spectrum_in = builder.build()

    result = select_by_mz(spectrum_in, mz_from=mz_from, mz_to=mz_to)

    expected_mz = expected[0]
    expected_intensities = expected[1]
    # Sizes first, so a length mismatch is reported before a value mismatch.
    assert result.peaks.mz.size == len(expected_mz)
    assert result.peaks.mz.size == result.peaks.intensities.size
    assert numpy.array_equal(result.peaks.mz, expected_mz)
    assert numpy.array_equal(result.peaks.intensities, expected_intensities)
def apply_my_filters(s):
    """Example of a user-defined pre- and post-processing pipeline.

    Applies, in order: default filters, parent mass, intensity
    normalisation, peak-number reduction, m/z selection (0 to 1000),
    loss calculation (10.0 to 200.0) and a minimum-peak requirement of 5.
    """
    # Metadata-level preparation.
    spectrum = add_parent_mass(default_filters(s))

    # Peak-level processing.
    spectrum = normalize_intensities(spectrum)
    spectrum = reduce_to_number_of_peaks(spectrum,
                                         n_required=10,
                                         ratio_desired=0.5)
    spectrum = select_by_mz(spectrum, mz_from=0, mz_to=1000)
    spectrum = add_losses(spectrum, loss_mz_from=10.0, loss_mz_to=200.0)

    return require_minimum_number_of_peaks(spectrum, n_required=5)
def spectrum_processing(s):
    """Example of a typical pre- and post-processing pipeline.

    Applies, in order: default filters, precursor m/z, intensity
    normalisation, peak-number reduction (n_required=5, n_max=500),
    m/z selection (0 to 1000), loss calculation (10.0 to 200.0) and a
    minimum-peak requirement of 5.
    """
    cleaned = default_filters(s)
    with_precursor = add_precursor_mz(cleaned)
    normalized = normalize_intensities(with_precursor)
    reduced = reduce_to_number_of_peaks(
        normalized, n_required=5, ratio_desired=0.5, n_max=500)
    in_mz_window = select_by_mz(reduced, mz_from=0, mz_to=1000)
    with_losses = add_losses(in_mz_window, loss_mz_from=10.0, loss_mz_to=200.0)
    return require_minimum_number_of_peaks(with_losses, n_required=5)
def test_select_by_mz_with_from_and_to_parameters():
    """Only peaks inside the explicit [15.0, 35.0] m/z window are kept."""
    spectrum_in = Spectrum(
        mz=numpy.array([10, 20, 30, 40], dtype="float"),
        intensities=numpy.array([1, 10, 100, 1000], dtype="float"),
    )
    expected_mz = numpy.array([20, 30], dtype="float")
    expected_intensities = numpy.array([10, 100], dtype="float")

    result = select_by_mz(spectrum_in, mz_from=15.0, mz_to=35.0)

    assert result.peaks.mz.size == 2
    assert result.peaks.mz.size == result.peaks.intensities.size
    assert numpy.array_equal(result.peaks.mz, expected_mz)
    assert numpy.array_equal(result.peaks.intensities, expected_intensities)
def test_select_by_mz_no_parameters_1():
    """With default bounds, a spectrum with m/z 10 to 40 is left unchanged."""
    all_mz = numpy.array([10, 20, 30, 40], dtype="float")
    all_intensities = numpy.array([1, 10, 100, 1000], dtype="float")
    spectrum_in = Spectrum(mz=all_mz,
                           intensities=all_intensities,
                           metadata={})

    result = select_by_mz(spectrum_in)

    assert result.peaks.mz.size == 4
    assert result.peaks.mz.size == result.peaks.intensities.size
    assert numpy.array_equal(
        result.peaks.mz,
        numpy.array([10, 20, 30, 40], dtype="float"))
    assert numpy.array_equal(
        result.peaks.intensities,
        numpy.array([1, 10, 100, 1000], dtype="float"))
def test_select_by_mz_no_parameters_2():
    """With default bounds, peaks at m/z 1001 and 1002 are dropped."""
    spectrum_in = Spectrum(
        mz=numpy.array([998, 999, 1000, 1001, 1002], dtype="float"),
        intensities=numpy.array([1, 10, 100, 1000, 10000], dtype="float"),
    )
    expected_mz = numpy.array([998, 999, 1000], dtype="float")
    expected_intensities = numpy.array([1, 10, 100], dtype="float")

    result = select_by_mz(spectrum_in)

    assert result.peaks.mz.size == 3
    assert result.peaks.mz.size == result.peaks.intensities.size
    assert numpy.array_equal(result.peaks.mz, expected_mz)
    assert numpy.array_equal(result.peaks.intensities, expected_intensities)
def post_process(s):
    """Post-process a spectrum: normalise, filter peaks and add losses.

    Steps, in order: intensity normalisation, m/z selection (0 to 1000),
    minimum-peak requirement (10), best-effort peak-number reduction,
    optional low-intensity peak removal, and loss calculation
    (5.0 to 200.0).

    Parameters
    ----------
    s:
        Input spectrum.

    Returns
    -------
    The processed spectrum, or ``None`` when the filters rejected it.
    """
    s = normalize_intensities(s)
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=10)
    try:
        s = reduce_to_number_of_peaks(s, n_required=10, ratio_desired=0.5)
    except Exception:
        # Best-effort step: on failure keep the spectrum as it was before
        # the reduction. `Exception` (rather than a bare `except`) lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    if s is None:
        return None

    # Only keep the low-intensity removal when at least 10 peaks survive it.
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks

    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s
def post_process_md(spectrum_in: SpectrumType,
                    low_int_cutoff: float = 0.05,
                    min_peaks: int = 10,
                    max_peaks: int = 30) -> Union[SpectrumType, None]:
    """Processing of spectra that are used for mass difference extraction.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed straight through.
    low_int_cutoff:
        Lower intensity cutoff for the peaks selected for MD.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None).
        The same value is the floor for both intensity-based removal
        steps: a removal is only kept when at least ``min_peaks`` peaks
        remain afterwards.
    max_peaks:
        Maximum number of peaks allowed in the spectrum (ranked on
        intensity).

    Returns
    -------
    The processed clone of ``spectrum_in``, or ``None`` when the spectrum
    is rejected.
    """
    if spectrum_in is None:
        return None

    # Work on a clone of the input rather than on the input object itself.
    s = spectrum_in.clone()
    # Remove the precursor_mz peak from spectra so neutral losses don't end
    # up in MDs.
    s = remove_precursor_mz_peak(s)
    s = normalize_intensities(s)
    # Per the original author's note, NaN intensities after normalisation
    # mark spectra whose intensities are all 0; those are rejected.
    if any(np.isnan(s.peaks[1])):
        return None

    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None

    # Remove low peaks unless fewer than `min_peaks` peaks would be left.
    # (Previously compared against a hard-coded 10, which ignored a
    # caller-supplied `min_peaks`.)
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= min_peaks:
        s = s_remove_low_peaks

    # Do an additional removal step with a different intensity cutoff,
    # guarded by the same `min_peaks` floor.
    s_second_peak_removal = select_by_relative_intensity(
        s, intensity_from=low_int_cutoff)
    if len(s_second_peak_removal.peaks) >= min_peaks:
        s = s_second_peak_removal

    # Reduce to at most `max_peaks` peaks (top 30 by default).
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, n_max=max_peaks)
    return s
def post_process_classical(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Processing of spectra for calculating classical scores.

    Parameters
    ----------
    spectrum_in:
        Input spectrum. ``None`` is passed straight through.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None).

    Returns
    -------
    The processed clone of ``spectrum_in``, or ``None`` when rejected.
    """
    if spectrum_in is None:
        return None

    # Normalise a clone of the input rather than the input object itself.
    spectrum = normalize_intensities(spectrum_in.clone())

    # Per the original author's note, NaN intensities mark spectra whose
    # intensities are all 0; those are rejected.
    intensities = spectrum.peaks[1]
    if any(np.isnan(intensities)):
        return None

    in_mz_window = select_by_mz(spectrum, mz_from=0, mz_to=1000)
    enough_peaks = require_minimum_number_of_peaks(in_mz_window,
                                                   n_required=min_peaks)
    return select_by_relative_intensity(enough_peaks,
                                        intensity_from=0.01,
                                        intensity_to=1.0)
def spectrum_processing_minimal(
        spectrum: SpectrumType,
        **settings: Union[int, float]) -> Union[SpectrumType, None]:
    """Minimal necessary spectrum processing that is required by MS2Query.

    This mostly includes intensity normalization and setting spectra to
    None when they do not meet the minimum requirements.

    Parameters
    ----------
    spectrum:
        Spectrum to process.
    mz_from:
        Lower threshold for m/z peak positions. Default is 10.0.
    n_required_below_mz:
        Minimum number of peaks (counted up to ``max_mz_required``) for a
        spectrum to be considered. Spectra not meeting this criterion will
        be set to None.
    intensity_from:
        Lower threshold for peak intensity. Default is 0.001.
    max_mz_required:
        Only peaks <= max_mz_required will be counted to check if the
        spectrum contains sufficient peaks to be considered.
    """
    # Fill in any setting the caller did not provide.
    options = set_minimal_processing_defaults(**settings)

    processed = normalize_intensities(spectrum)
    processed = select_by_intensity(
        processed, intensity_from=options["intensity_from"])
    # No upper m/z bound is applied at this stage.
    processed = select_by_mz(
        processed, mz_from=options["mz_from"], mz_to=np.inf)
    processed = require_peaks_below_mz(
        processed,
        n_required=options["n_required_below_mz"],
        max_mz=options["max_mz_required"])
    return require_precursor_mz(processed)
def spectrum_processing_s2v(
        spectrum: SpectrumType,
        **settings: Union[int, float]) -> Union[SpectrumType]:
    """Spectrum processing required for computing Spec2Vec scores.

    Parameters
    ----------
    spectrum:
        Spectrum to process.
    mz_from:
        Peaks below this value are removed. Default = 10.0
    mz_to:
        Peaks above this value are removed. Default = 1000.0
    n_required:
        Number of minimal required peaks for a spectrum to be considered.
    n_max:
        Maximum number of peaks to be kept per spectrum. Default is 1000.
    loss_mz_from:
        Minimum allowed m/z value for losses. Default is 0.0.
    loss_mz_to:
        Maximum allowed m/z value for losses. Default is 1000.0.

    Raises
    ------
    AssertionError
        If the processed spectrum is ``None``.
    """
    # Fill in any setting the caller did not provide.
    options = set_spec2vec_defaults(**settings)

    in_mz_window = select_by_mz(
        spectrum, mz_from=options["mz_from"], mz_to=options["mz_to"])
    reduced = reduce_to_number_of_peaks(
        in_mz_window,
        n_required=options["n_required"],
        n_max=options["n_max"])
    result = add_losses(
        reduced,
        loss_mz_from=options["loss_mz_from"],
        loss_mz_to=options["loss_mz_to"])

    assert result is not None, \
        "Expects Spectrum that has high enough quality and is not None"
    return result
def apply_filters(s):
    """Normalise ``s``, restrict its peaks and clear its losses.

    Peaks are limited to the m/z window ``min_mz``..``max_mz`` and to a
    relative intensity of at least ``intensity_threshold``; those three
    names come from the enclosing scope. ``losses`` on the result is set
    to ``None`` before it is returned.
    """
    normalized = normalize_intensities(s)
    filtered = select_by_relative_intensity(
        select_by_mz(normalized, mz_from=min_mz, mz_to=max_mz),
        intensity_from=intensity_threshold,
    )
    filtered.losses = None
    return filtered
def main(argv):
    """Filter a mass spectra file with the filters selected on the command line.

    Loads spectra from ``--spectra`` (format ``msp`` or ``mgf``), applies
    every selected filter to each spectrum in a fixed order, and writes the
    result to ``--output`` in the same format as the input.

    Parameters
    ----------
    argv:
        Command line arguments.
        NOTE(review): ``argv`` is not forwarded to ``parse_args()``, so
        ``sys.argv`` is what actually gets parsed. Left unchanged because
        the callers are not visible here; confirm before forwarding it.

    Returns
    -------
    int
        ``0`` on success.

    Raises
    ------
    ValueError
        If no filter flag was given, or if ``--spectra_format`` is neither
        ``msp`` nor ``mgf``.
    """
    parser = argparse.ArgumentParser(
        description="Compute MSP similarity scores")
    parser.add_argument("--spectra", type=str, required=True,
                        help="Mass spectra file to be filtered.")
    parser.add_argument("--spectra_format", type=str, required=True,
                        help="Format of spectra file.")
    parser.add_argument("--output", type=str, required=True,
                        help="Filtered mass spectra file.")
    parser.add_argument(
        "-normalise_intensities", action='store_true',
        help="Normalize intensities of peaks (and losses) to unit height.")
    parser.add_argument(
        "-default_filters", action='store_true',
        help=
        "Collection of filters that are considered default and that do no require any (factory) arguments."
    )
    parser.add_argument(
        "-clean_metadata", action='store_true',
        help=
        "Apply all adding and cleaning filters if possible, so that the spectra have canonical metadata."
    )
    parser.add_argument(
        "-relative_intensity", action='store_true',
        help=
        "Keep only peaks within set relative intensity range (keep if to_intensity >= intensity >= from_intensity)."
    )
    parser.add_argument("--from_intensity", type=float,
                        help="Lower bound for intensity filter")
    parser.add_argument("--to_intensity", type=float,
                        help="Upper bound for intensity filter")
    parser.add_argument(
        "-mz_range", action='store_true',
        help=
        "Keep only peaks between set m/z range (keep if to_mz >= m/z >= from_mz)."
    )
    parser.add_argument("--from_mz", type=float,
                        help="Lower bound for m/z filter")
    parser.add_argument("--to_mz", type=float,
                        help="Upper bound for m/z filter")
    args = parser.parse_args()

    if not (args.normalise_intensities or args.default_filters
            or args.clean_metadata or args.relative_intensity
            or args.mz_range):
        raise ValueError('No filter selected.')

    if args.spectra_format == 'msp':
        spectra = list(load_from_msp(args.spectra))
    # Fixed: this branch used to read `args.queries_format`, an option the
    # parser never defines, so any non-msp input raised AttributeError.
    elif args.spectra_format == 'mgf':
        spectra = list(load_from_mgf(args.spectra))
    else:
        raise ValueError(
            f'File format {args.spectra_format} not supported for mass spectra file.'
        )

    # Metadata filters applied (in this order) when -clean_metadata is set;
    # built once because the list does not depend on the spectrum.
    metadata_filters = [
        add_compound_name, add_precursor_mz, add_fingerprint, add_losses,
        add_parent_mass, add_retention_index, add_retention_time,
        clean_compound_name
    ]

    filtered_spectra = []
    for spectrum in spectra:
        if args.normalise_intensities:
            spectrum = normalize_intensities(spectrum)

        if args.default_filters:
            spectrum = default_filters(spectrum)

        if args.clean_metadata:
            for metadata_filter in metadata_filters:
                spectrum = metadata_filter(spectrum)

        if args.relative_intensity:
            spectrum = select_by_relative_intensity(spectrum,
                                                    args.from_intensity,
                                                    args.to_intensity)

        if args.mz_range:
            spectrum = select_by_mz(spectrum, args.from_mz, args.to_mz)

        filtered_spectra.append(spectrum)

    # The format was validated above, so anything that is not msp is mgf.
    if args.spectra_format == 'msp':
        save_as_msp(filtered_spectra, args.output)
    else:
        save_as_mgf(filtered_spectra, args.output)

    return 0