Example #1
    def __call__(self, spectrum: SpectrumType,
                 reference_spectrum: SpectrumType) -> bool:
        """Compare parent masses"""
        parentmass = spectrum.get("parent_mass")
        parentmass_ref = reference_spectrum.get("parent_mass")
        assert parentmass is not None and parentmass_ref is not None, "Missing parent mass."
        return abs(parentmass - parentmass_ref) <= self.tolerance
Example #2
    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Compare precursor m/z between reference and query spectrum.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.
        """
        entry_ref = reference.get(self.field)
        entry_query = query.get(self.field)
        if entry_ref is None or entry_query is None:
            return np.asarray(False, dtype=self.score_datatype)

        if self.matching_type == "equal_match":
            score = (entry_ref == entry_query)
            return np.asarray(score, dtype=self.score_datatype)

        if isinstance(entry_ref,
                      (int, float)) and isinstance(entry_query, (int, float)):
            score = abs(entry_ref - entry_query) <= self.tolerance
            return np.asarray(score, dtype=self.score_datatype)

        logger.warning(
            "Non-numerical entry not compatible with 'difference' method")
        return np.asarray(False, dtype=self.score_datatype)
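
A minimal usage sketch for the method above, assuming it belongs to a similarity class that stores `field`, `matching_type`, `tolerance`, and `score_datatype`; the spectra and metadata values below are invented and only illustrate the two branches.

import numpy as np
from matchms import Spectrum

reference = Spectrum(mz=np.array([100.0, 200.0]),
                     intensities=np.array([0.5, 1.0]),
                     metadata={"instrument_type": "orbitrap", "precursor_mz": 444.0})
query = Spectrum(mz=np.array([110.0, 210.0]),
                 intensities=np.array([0.3, 1.0]),
                 metadata={"instrument_type": "orbitrap", "precursor_mz": 444.05})

# "equal_match" branch: exact equality of the metadata entries.
print(reference.get("instrument_type") == query.get("instrument_type"))  # True

# "difference" branch: numerical entries compared within a tolerance (here 0.1).
print(abs(reference.get("precursor_mz") - query.get("precursor_mz")) <= 0.1)  # True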
Example #3
def test_require_minimum_number_of_peaks_required_4_ratio_none(spectrum_in: SpectrumType):
    """Test if parent_mass scaling is properly ignored when not passing ratio_required."""
    spectrum_in.set("parent_mass", 100)

    spectrum = require_minimum_number_of_peaks(spectrum_in, n_required=4)

    assert spectrum == spectrum_in, "Expected the spectrum to qualify because the number of peaks (4) is equal to " \
                                    "the required number (4)."
Example #4
    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Compare parent masses between reference and query spectrum.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.
        """
        parentmass_ref = reference.get("parent_mass")
        parentmass_query = query.get("parent_mass")
        assert parentmass_ref is not None and parentmass_query is not None, "Missing parent mass."

        return abs(parentmass_ref - parentmass_query) <= self.tolerance
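
The snippet above matches the behavior of a parent-mass matching similarity; a hedged sketch of how such a class is typically exercised, assuming it is constructed like matchms's ParentMassMatch with a tolerance argument (all values invented).

import numpy as np
from matchms import Spectrum
from matchms.similarity import ParentMassMatch

spectrum_1 = Spectrum(mz=np.array([100.0, 150.0]),
                      intensities=np.array([0.7, 1.0]),
                      metadata={"parent_mass": 300.0})
spectrum_2 = Spectrum(mz=np.array([100.0, 140.0]),
                      intensities=np.array([0.4, 1.0]),
                      metadata={"parent_mass": 300.05})

# Parent masses differ by 0.05, which is within the 0.1 tolerance -> match.
similarity = ParentMassMatch(tolerance=0.1)
print(similarity.pair(spectrum_1, spectrum_2))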
Example #5
def post_process_normal(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Normal processing of spectra for Spec2Vec

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)
    """
    if spectrum_in is None:
        return None

    s = spectrum_in.clone()
    s = normalize_intensities(s)
    if any(np.isnan(s.peaks[1])):
        return None  # remove spectra that have all intensities 0
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None
    # remove low peaks unless fewer than 10 peaks would be left
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks
    # add losses to normally processed spectra
    s = add_losses(s, loss_mz_from=5.0, loss_mz_to=200.0)
    return s
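
A short usage sketch for the pipeline above, using the function just defined; the peak values are invented, and parent_mass/precursor_mz are included since the ratio-based peak reduction and the loss calculation rely on them.

import numpy as np
from matchms import Spectrum

spectrum = Spectrum(mz=np.linspace(50.0, 500.0, 12),
                    intensities=np.linspace(0.1, 1.0, 12),
                    metadata={"parent_mass": 500.0, "precursor_mz": 501.0})

processed = post_process_normal(spectrum, min_peaks=10)
if processed is None:
    print("Spectrum rejected by one of the filters")
else:
    print(len(processed.peaks), "peaks,", len(processed.losses), "losses")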
Example #6
def remove_precursor_mz_peak(spectrum_in: SpectrumType) -> SpectrumType:
    """Remove the peak for precursor_mz in the spectrum (if it exists)

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    prec_mz = spectrum.get("precursor_mz")
    before_len = len(spectrum.peaks)
    if prec_mz:  # precursor_mz exists
        mzs, intensities = spectrum.peaks.clone()
        prec_mz_i = [i for i, mz in enumerate(mzs) if mz == prec_mz]
        if prec_mz_i:  # precursor_mz peak exists -> remove it
            new_mzs = np.delete(mzs, prec_mz_i)
            new_intensities = np.delete(intensities, prec_mz_i)
            new_spikes = Spikes(mz=new_mzs, intensities=new_intensities)
            spectrum.peaks = new_spikes
            after_len = len(spectrum.peaks)
            assert after_len == before_len - 1, \
                "Expected only one peak to have been removed"

    return spectrum
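
A toy example of the behavior of the function above (values invented): only a peak whose m/z exactly equals precursor_mz is dropped, everything else is kept.

import numpy as np
from matchms import Spectrum

spectrum = Spectrum(mz=np.array([100.0, 150.0, 444.0]),
                    intensities=np.array([0.2, 1.0, 0.5]),
                    metadata={"precursor_mz": 444.0})

cleaned = remove_precursor_mz_peak(spectrum)
print(len(spectrum.peaks), len(cleaned.peaks))  # 3 2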
Example #7
def require_peaks_below_mz(spectrum_in: SpectrumType,
                           n_required: int = 10,
                           max_mz: float = 1000.0) -> SpectrumType:
    """Spectrum will be set to None when it has fewer peaks than required.

    Args:
    ----------
    spectrum_in:
        Input spectrum.
    n_required:
        Number of minimum required peaks. Spectra with fewer peaks will be set
        to 'None'.
    max_mz:
        Only peaks <= max_mz will be counted to check if spectrum contains
        sufficient peaks to be considered (>= n_required).
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    if spectrum.peaks.mz[spectrum.peaks.mz < max_mz].size < n_required:
        return None

    return spectrum
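
For illustration (toy values, using the function above): with only three peaks below max_mz the spectrum fails n_required=4 and is replaced by None, while n_required=3 passes.

import numpy as np
from matchms import Spectrum

spectrum = Spectrum(mz=np.array([100.0, 200.0, 300.0, 1200.0]),
                    intensities=np.array([0.1, 0.5, 1.0, 0.8]),
                    metadata={})

print(require_peaks_below_mz(spectrum, n_required=4, max_mz=1000.0))          # None
print(require_peaks_below_mz(spectrum, n_required=3, max_mz=1000.0) is None)  # False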
Example #8
def normalize_intensities(spectrum_in: SpectrumType) -> SpectrumType:
    """Normalize intensities of peaks (and losses) to unit height."""

    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    if len(spectrum.peaks) == 0:
        return spectrum

    max_intensity = numpy.max(spectrum.peaks.intensities)

    # Normalize peak intensities
    mz, intensities = spectrum.peaks
    normalized_intensities = intensities / max_intensity
    spectrum.peaks = Spikes(mz=mz, intensities=normalized_intensities)

    # Normalize loss intensities
    if spectrum.losses is not None and len(spectrum.losses) > 0:
        mz, intensities = spectrum.losses
        normalized_intensities = intensities / max_intensity
        spectrum.losses = Spikes(mz=mz, intensities=normalized_intensities)

    return spectrum
Example #9
    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Compare precursor m/z between reference and query spectrum.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.
        """
        precursormz_ref = reference.get("precursor_mz")
        precursormz_query = query.get("precursor_mz")
        assert precursormz_ref is not None and precursormz_query is not None, "Missing precursor m/z."

        if self.type == "Dalton":
            return abs(precursormz_ref - precursormz_query) <= self.tolerance

        mean_mz = (precursormz_ref + precursormz_query) / 2
        score = abs(precursormz_ref -
                    precursormz_query) / mean_mz <= self.tolerance
        return numpy.asarray(score, dtype=self.score_datatype)
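
A worked example of the two branches above with invented numbers: in "Dalton" mode the tolerance is an absolute m/z difference, while in the other branch it acts as a fraction of the mean precursor m/z.

precursormz_ref, precursormz_query = 500.00, 500.05
tolerance = 0.1

# "Dalton" branch: absolute difference of 0.05 <= 0.1 -> match
print(abs(precursormz_ref - precursormz_query) <= tolerance)

# relative branch: 0.05 / 500.025 ~ 1e-4 <= 0.1 -> match
mean_mz = (precursormz_ref + precursormz_query) / 2
print(abs(precursormz_ref - precursormz_query) / mean_mz <= tolerance)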
Example #10
    def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
        """Calculate fingerprint based similarity score between two spectra.

        Parameters
        ----------
        reference
            Single reference spectrum.
        query
            Single query spectrum.
        """
        fingerprint_ref = reference.get("fingerprint")
        fingerprint_query = query.get("fingerprint")
        if self.similarity_measure == "jaccard":
            return jaccard_index(fingerprint_ref, fingerprint_query)

        if self.similarity_measure == "dice":
            return dice_similarity(fingerprint_ref, fingerprint_query)

        if self.similarity_measure == "cosine":
            return cosine_similarity(fingerprint_ref, fingerprint_query)

        raise NotImplementedError
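
For intuition, a self-contained numpy sketch of the three similarity measures on binary fingerprints; this only restates the standard definitions, it is not the imported jaccard_index/dice_similarity/cosine_similarity helpers.

import numpy as np

fp_ref = np.array([1, 1, 0, 1, 0, 0, 1])
fp_query = np.array([1, 0, 0, 1, 1, 0, 1])

intersection = np.sum(fp_ref & fp_query)   # bits set in both
union = np.sum(fp_ref | fp_query)          # bits set in either

jaccard = intersection / union
dice = 2 * intersection / (np.sum(fp_ref) + np.sum(fp_query))
cosine = np.dot(fp_ref, fp_query) / (np.linalg.norm(fp_ref) * np.linalg.norm(fp_query))
print(jaccard, dice, cosine)  # 0.6 0.75 0.75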
Example #11
def normalize_intensities(spectrum_in: SpectrumType) -> SpectrumType:
    """Normalize intensities to unit height."""

    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()

    if len(spectrum.peaks) > 0:
        scale_factor = numpy.max(spectrum.peaks.intensities)
        mz, intensities = spectrum.peaks
        normalized_intensities = intensities / scale_factor
        spectrum.peaks = Spikes(mz=mz, intensities=normalized_intensities)

    return spectrum
Example #12
    def get_tanimoto_for_spectrum_ids(
            self, query_spectrum: SpectrumType,
            spectra_ids_list: List[str]) -> pd.DataFrame:
        """Returns a dataframe with tanimoto scores

        Spectra in spectra_ids_list without inchikey are removed.
        Args:
        ------
        query_spectrum:
            Single Spectrum, the tanimoto scores are calculated between this
            spectrum and the spectra in match_spectrum_ids.
        match_spectrum_ids:
            list of spectrum_ids, which are preselected matches of the
            query_spectrum
        """
        query_inchikey14 = query_spectrum.get("inchikey")[:14]
        assert len(query_inchikey14) == 14, \
            f"Expected inchikey of length 14, " \
            f"got inchikey = {query_inchikey14}"

        # Get inchikeys belonging to spectra ids
        metadata_dict = get_metadata_from_sqlite(
            self.sqlite_file_name, spectra_ids_list,
            self.settings["spectrum_id_column_name"])
        unfiltered_inchikeys = [
            metadata_dict[spectrum_id]["inchikey"]
            for spectrum_id in spectra_ids_list
        ]

        inchikey14s_dict = {}
        for i, inchikey in enumerate(unfiltered_inchikeys):
            # Only get the first 14 characters of the inchikeys
            inchikey14 = inchikey[:14]
            spectrum_id = spectra_ids_list[i]
            # Skip spectra that do not have an inchikey. A missing inchikey is
            # stored as "", so it fails the 14-character check below.
            if len(inchikey14) == 14:
                inchikey14s_dict[spectrum_id] = inchikey14

        tanimoto_scores_spectra_ids = pd.DataFrame(
            columns=["Tanimoto_score"], index=list(inchikey14s_dict.keys()))
        for spectrum_id, inchikey14 in inchikey14s_dict.items():
            tanimoto_score = self.tanimoto_scores.loc[inchikey14,
                                                      query_inchikey14]
            tanimoto_scores_spectra_ids.at[spectrum_id, "Tanimoto_score"] = \
                tanimoto_score
        return tanimoto_scores_spectra_ids
Example #13
def post_process_md(spectrum_in: SpectrumType,
                    low_int_cutoff: float = 0.05,
                    min_peaks: int = 10,
                    max_peaks: int = 30) -> Union[SpectrumType, None]:
    """Processing of spectra that are used for mass difference extraction

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    low_int_cutoff:
        Lower intensity cutoff for the peaks selected for MD
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)
    max_peaks:
        Maximum number of peaks allowed in the spectrum (ranked on intensity)
    """
    if spectrum_in is None:
        return None

    s = spectrum_in.clone()
    # remove precursor_mz peak from spectra so neutral losses don't end up in MDs
    s = remove_precursor_mz_peak(s)
    s = normalize_intensities(s)
    if any(np.isnan(s.peaks[1])):
        return None  # remove spectra that have all intensities 0
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, ratio_desired=0.5)
    if s is None:
        return None
    # remove low peaks unless fewer than 10 peaks would be left
    s_remove_low_peaks = select_by_relative_intensity(s, intensity_from=0.001)
    if len(s_remove_low_peaks.peaks) >= 10:
        s = s_remove_low_peaks
    # do an additional removal step with a different intensity cutoff
    s_second_peak_removal = select_by_relative_intensity(
        s, intensity_from=low_int_cutoff)
    if len(s_second_peak_removal.peaks) >= 10:
        s = s_second_peak_removal

    # reduce to at most max_peaks peaks (default 30), ranked on intensity
    s = reduce_to_number_of_peaks(s, n_required=min_peaks, n_max=max_peaks)
    return s
Example #14
def post_process_classical(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Processing of spectra for calculating classical scores

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)
    """
    if spectrum_in is None:
        return None

    s = spectrum_in.clone()
    s = normalize_intensities(s)
    if any(np.isnan(s.peaks[1])):
        return None  # remove spectra that have all intensities 0
    s = select_by_mz(s, mz_from=0, mz_to=1000)
    s = require_minimum_number_of_peaks(s, n_required=min_peaks)
    s = select_by_relative_intensity(s, intensity_from=0.01, intensity_to=1.0)
    return s
Example #15
def test_require_minimum_number_of_peaks_required_4_or_1_no_parent_mass(spectrum_in: SpectrumType):
    spectrum_in.set("parent_mass", None)
    spectrum = require_minimum_number_of_peaks(spectrum_in, n_required=4, ratio_required=0.1)

    assert spectrum == spectrum_in, "Expected the spectrum to qualify because the number of peaks (4) is equal to " \
                                    "the required number (4)."
Example #16
def test_require_minimum_number_of_peaks_required_5_or_10(spectrum_in: SpectrumType):
    spectrum_in.set("parent_mass", 100)
    spectrum = require_minimum_number_of_peaks(spectrum_in, n_required=5, ratio_required=0.1)

    assert spectrum is None, "Did not expect the spectrum to qualify because the number of peaks (4) is less " \
                             "than the required number (10)."
Example #17
def get_mass_differences(spectrum_in: SpectrumType,
                         multiply: bool = False,
                         max_mds_per_peak: int = 30,
                         cutoff: int = 36,
                         n_max: int = 100) -> Union[Spikes, None]:
    """Returns Spikes with top X mass differences and intensities

    Parameters
    ----------
    spectrum_in:
        Spectrum in matchms.Spectrum format
    multiply:
        Multiply parent peak intensities instead of taking the mean
    max_mds_per_peak:
        Maximum number of MDs that can originate from one peak, ranked on
        intensity. The minimum is 2 (with this implementation).
    cutoff:
        Mass cutoff for mass differences (default like Xing et al.)
    n_max:
        Maximum number of mass differences to select, ranked on intensity
        (default like Xing et al.)
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    peaks_mz_ori, peaks_intensities_ori = spectrum.peaks
    # sort on intensities to allow for max_mds_per_peak selection
    sort_idx = peaks_intensities_ori.argsort()[::-1]
    peaks_intensities = peaks_intensities_ori[sort_idx]
    peaks_mz = peaks_mz_ori[sort_idx]

    # for every peak, calculate MDs to all other peaks
    mass_diff_mz = []
    mass_diff_intensities = []
    used_mz_dict = {mz_val: 0 for mz_val in peaks_mz}  # keep track of used mz
    for i, (mz_i,
            int_i) in enumerate(zip(peaks_mz[:-1], peaks_intensities[:-1])):
        cur = used_mz_dict[mz_i]  # number of uses of this peak
        allowed = max_mds_per_peak - cur  # still allowed uses
        for mz_j, int_j in zip(peaks_mz[i + 1:i + 1 + allowed],
                               peaks_intensities[i + 1:i + 1 + allowed]):
            # update used peaks dict
            used_mz_dict[mz_i] += 1
            used_mz_dict[mz_j] += 1
            # calculate mass difference
            mz_diff = mz_j - mz_i
            if mz_diff > cutoff:
                mass_diff_mz.append(mz_diff)
                if multiply:
                    new_intensity = int_i * int_j
                else:
                    new_intensity = np.mean([int_i, int_j])
                mass_diff_intensities.append(new_intensity)
    # select the top n_max mass differences by intensity, then sort them by m/z
    mass_diff_mz = np.array(mass_diff_mz)
    mass_diff_intensities = np.array(mass_diff_intensities)
    idx = mass_diff_intensities.argsort()[-n_max:]
    idx_sort_by_mz = mass_diff_mz[idx].argsort()
    mass_diff_peaks = Spikes(
        mz=mass_diff_mz[idx][idx_sort_by_mz],
        intensities=mass_diff_intensities[idx][idx_sort_by_mz])
    return mass_diff_peaks
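
A small usage sketch for the function above (peak values invented): with the default cutoff of 36 only pairwise differences larger than 36 Da survive, and the result is a Spikes object sorted by m/z.

import numpy as np
from matchms import Spectrum

spectrum = Spectrum(mz=np.array([100.0, 120.0, 180.0, 250.0]),
                    intensities=np.array([0.2, 1.0, 0.6, 0.4]),
                    metadata={})

mass_diffs = get_mass_differences(spectrum)
print(mass_diffs.mz)           # [ 60.  70. 130.] -- differences above the 36 Da cutoff
print(mass_diffs.intensities)  # mean of the two parent peak intensities for each MD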