def __call__(self, spectrum: SpectrumType, reference_spectrum: SpectrumType) -> bool:
    """Return True when the two spectra's parent masses differ by at most self.tolerance."""
    mass = spectrum.get("parent_mass")
    mass_ref = reference_spectrum.get("parent_mass")
    # Both spectra must carry a parent mass for the comparison to make sense.
    assert mass is not None and mass_ref is not None, "Missing parent mass."
    return abs(mass - mass_ref) <= self.tolerance
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Compare the configured metadata field between reference and query spectrum.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.
    """
    value_ref = reference.get(self.field)
    value_query = query.get(self.field)

    # A missing entry on either side can never match.
    if value_ref is None or value_query is None:
        return np.asarray(False, dtype=self.score_datatype)

    if self.matching_type == "equal_match":
        return np.asarray(value_ref == value_query, dtype=self.score_datatype)

    # "difference" matching only makes sense for numerical entries.
    both_numeric = (isinstance(value_ref, (int, float))
                    and isinstance(value_query, (int, float)))
    if both_numeric:
        within_tolerance = abs(value_ref - value_query) <= self.tolerance
        return np.asarray(within_tolerance, dtype=self.score_datatype)

    logger.warning(
        "Non-numerical entry not compatible with 'difference' method")
    return np.asarray(False, dtype=self.score_datatype)
def test_require_minimum_number_of_peaks_required_4_ratio_none(spectrum_in: SpectrumType):
    """Test if parent_mass scaling is properly ignored when not passing ratio_required."""
    spectrum_in.set("parent_mass", 100)

    spectrum = require_minimum_number_of_peaks(spectrum_in, n_required=4)

    # BUG FIX: the two string fragments previously concatenated to
    # "...equal to therequired number" (missing space at the join).
    assert spectrum == spectrum_in, \
        "Expected the spectrum to qualify because the number of peaks (4) is equal to the " \
        "required number (4)."
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Return whether the parent masses of both spectra lie within the tolerance.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.
    """
    mass_ref = reference.get("parent_mass")
    mass_query = query.get("parent_mass")
    # Both spectra must provide a parent mass.
    assert mass_ref is not None and mass_query is not None, "Missing parent mass."
    mass_difference = abs(mass_ref - mass_query)
    return mass_difference <= self.tolerance
def post_process_normal(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Normal processing of spectra for Spec2Vec

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)
    """
    if spectrum_in is None:
        return None

    processed = spectrum_in.clone()
    processed = normalize_intensities(processed)
    # Spectra whose intensities were all zero come out of normalization as NaN.
    if any(np.isnan(processed.peaks[1])):
        return None
    processed = select_by_mz(processed, mz_from=0, mz_to=1000)
    processed = require_minimum_number_of_peaks(processed, n_required=min_peaks)
    processed = reduce_to_number_of_peaks(processed, n_required=min_peaks,
                                          ratio_desired=0.5)
    if processed is None:
        return None
    # Drop very low peaks, but only when at least 10 peaks survive the filter.
    low_peaks_removed = select_by_relative_intensity(processed, intensity_from=0.001)
    if len(low_peaks_removed.peaks) >= 10:
        processed = low_peaks_removed
    # Add losses to normally processed spectra.
    return add_losses(processed, loss_mz_from=5.0, loss_mz_to=200.0)
def remove_precursor_mz_peak(spectrum_in: SpectrumType) -> SpectrumType:
    """Remove the peak for precursor_mz in the spectrum (if it exists)

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    precursor_mz = spectrum.get("precursor_mz")
    peaks_before = len(spectrum.peaks)
    if precursor_mz:  # precursor_mz exists in the metadata
        mzs, intensities = spectrum.peaks.clone()
        # Indices of peaks whose m/z exactly equals the precursor m/z
        # (kept as a list so the truth test below is unambiguous).
        matching_indices = [idx for idx, mz_value in enumerate(mzs)
                            if mz_value == precursor_mz]
        if matching_indices:  # precursor peak present -> remove it
            spectrum.peaks = Spikes(
                mz=np.delete(mzs, matching_indices),
                intensities=np.delete(intensities, matching_indices))
            assert len(spectrum.peaks) == peaks_before - 1, \
                "Expected only one peak to have been removed"
    return spectrum
def require_peaks_below_mz(spectrum_in: SpectrumType, n_required: int = 10,
                           max_mz: float = 1000.0) -> SpectrumType:
    """Spectrum will be set to None when it has fewer peaks than required.

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    n_required:
        Number of minimum required peaks. Spectra with fewer peaks will be
        set to 'None'.
    max_mz:
        Only peaks with m/z strictly below max_mz are counted when checking
        whether the spectrum has at least n_required peaks.
        NOTE(review): the original docstring said "<= max_mz" but the code
        uses a strict "<" comparison — documented here as implemented.
    """
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    peaks_in_range = int((spectrum.peaks.mz < max_mz).sum())
    if peaks_in_range < n_required:
        return None
    return spectrum
def normalize_intensities(spectrum_in: SpectrumType) -> SpectrumType:
    """Normalize intensities of peaks (and losses) to unit height."""
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    if len(spectrum.peaks) == 0:
        return spectrum

    # Peak and loss intensities are both scaled by the highest peak intensity.
    scale = numpy.max(spectrum.peaks.intensities)

    peak_mz, peak_intensities = spectrum.peaks
    spectrum.peaks = Spikes(mz=peak_mz, intensities=peak_intensities / scale)

    losses = spectrum.losses
    if losses is not None and len(losses) > 0:
        loss_mz, loss_intensities = losses
        spectrum.losses = Spikes(mz=loss_mz, intensities=loss_intensities / scale)

    return spectrum
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Compare precursor m/z between reference and query spectrum.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.
    """
    precursormz_ref = reference.get("precursor_mz")
    precursormz_query = query.get("precursor_mz")
    # Both spectra must provide a precursor m/z.
    assert precursormz_ref is not None and precursormz_query is not None, "Missing precursor m/z."

    if self.type == "Dalton":
        # BUG FIX: this branch previously returned a raw Python bool while the
        # relative branch returned numpy.asarray(..., dtype=self.score_datatype);
        # return types are now consistent across both branches.
        score = abs(precursormz_ref - precursormz_query) <= self.tolerance
        return numpy.asarray(score, dtype=self.score_datatype)

    # Relative tolerance: normalize the difference by the mean precursor m/z.
    mean_mz = (precursormz_ref + precursormz_query) / 2
    score = abs(precursormz_ref - precursormz_query) / mean_mz <= self.tolerance
    return numpy.asarray(score, dtype=self.score_datatype)
def pair(self, reference: SpectrumType, query: SpectrumType) -> float:
    """Calculate fingerprint based similarity score between two spectra.

    Parameters
    ----------
    reference
        Single reference spectrum.
    query
        Single query spectrum.
    """
    fingerprint_ref = reference.get("fingerprint")
    fingerprint_query = query.get("fingerprint")
    # Dispatch table mapping the configured measure to its scoring function.
    measures = {
        "jaccard": jaccard_index,
        "dice": dice_similarity,
        "cosine": cosine_similarity,
    }
    score_function = measures.get(self.similarity_measure)
    if score_function is None:
        raise NotImplementedError
    return score_function(fingerprint_ref, fingerprint_query)
def normalize_intensities(spectrum_in: SpectrumType) -> SpectrumType:
    """Normalize intensities to unit height."""
    if spectrum_in is None:
        return None

    spectrum = spectrum_in.clone()
    # Only rescale when there are peaks to rescale.
    if len(spectrum.peaks) > 0:
        mz_values, intensity_values = spectrum.peaks
        highest = numpy.max(spectrum.peaks.intensities)
        spectrum.peaks = Spikes(mz=mz_values,
                                intensities=intensity_values / highest)
    return spectrum
def get_tanimoto_for_spectrum_ids(self,
                                  query_spectrum: SpectrumType,
                                  spectra_ids_list: List[str]) -> pd.DataFrame:
    """Returns a dataframe with tanimoto scores

    Spectra in spectra_ids_list without inchikey are removed.

    Args:
    ------
    query_spectrum:
        Single Spectrum, the tanimoto scores are calculated between this
        spectrum and the spectra in spectra_ids_list.
    spectra_ids_list:
        List of spectrum_ids, which are preselected matches of the
        query_spectrum.
    """
    # Tanimoto scores are keyed by the first 14 inchikey characters.
    query_inchikey14 = query_spectrum.get("inchikey")[:14]
    assert len(query_inchikey14) == 14, \
        f"Expected inchikey of length 14, " \
        f"got inchikey = {query_inchikey14}"

    # Get inchikeys belonging to spectra ids
    metadata_dict = get_metadata_from_sqlite(
        self.sqlite_file_name,
        spectra_ids_list,
        self.settings["spectrum_id_column_name"])
    unfiltered_inchikeys = [metadata_dict[spectrum_id]["inchikey"]
                            for spectrum_id in spectra_ids_list]

    inchikey14s_dict = {}
    for i, inchikey in enumerate(unfiltered_inchikeys):
        # Only get the first 14 characters of the inchikeys
        inchikey14 = inchikey[:14]
        spectrum_id = spectra_ids_list[i]
        # Don't save spectra that do not have an inchikey. If a spectra has
        # no inchikey it is stored as "", so it will not be stored.
        if len(inchikey14) == 14:
            inchikey14s_dict[spectrum_id] = inchikey14

    tanimoto_scores_spectra_ids = pd.DataFrame(
        columns=["Tanimoto_score"],
        index=list(inchikey14s_dict.keys()))
    for spectrum_id, inchikey14 in inchikey14s_dict.items():
        # NOTE(review): assumes self.tanimoto_scores is a pandas DataFrame
        # indexed by inchikey14 on both axes — confirm against the loader.
        tanimoto_score = self.tanimoto_scores.loc[inchikey14,
                                                  query_inchikey14]
        tanimoto_scores_spectra_ids.at[spectrum_id, "Tanimoto_score"] = \
            tanimoto_score
    return tanimoto_scores_spectra_ids
def post_process_md(spectrum_in: SpectrumType, low_int_cutoff: float = 0.05,
                    min_peaks: int = 10,
                    max_peaks: int = 30) -> Union[SpectrumType, None]:
    """Processing of spectra that are used for mass difference extraction

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    low_int_cutoff:
        Lower intensity cutoff for the peaks selected for MD
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)
    max_peaks:
        Maximum number of peaks allowed in the spectrum (ranked on intensity)
    """
    if spectrum_in is None:
        return None

    processed = spectrum_in.clone()
    # Remove the precursor m/z peak so neutral losses don't end up in MDs.
    processed = remove_precursor_mz_peak(processed)
    processed = normalize_intensities(processed)
    # Spectra whose intensities were all zero become NaN after normalizing.
    if any(np.isnan(processed.peaks[1])):
        return None
    processed = select_by_mz(processed, mz_from=0, mz_to=1000)
    processed = require_minimum_number_of_peaks(processed, n_required=min_peaks)
    processed = reduce_to_number_of_peaks(processed, n_required=min_peaks,
                                          ratio_desired=0.5)
    if processed is None:
        return None
    # Drop very low peaks, but only when at least 10 peaks survive.
    looser_filtered = select_by_relative_intensity(processed, intensity_from=0.001)
    if len(looser_filtered.peaks) >= 10:
        processed = looser_filtered
    # Additional removal step with the stricter MD intensity cutoff.
    stricter_filtered = select_by_relative_intensity(
        processed, intensity_from=low_int_cutoff)
    if len(stricter_filtered.peaks) >= 10:
        processed = stricter_filtered
    # Keep at most max_peaks peaks (ranked on intensity).
    return reduce_to_number_of_peaks(processed, n_required=min_peaks,
                                     n_max=max_peaks)
def post_process_classical(spectrum_in: SpectrumType, min_peaks: int = 10) \
        -> Union[SpectrumType, None]:
    """Processing of spectra for calculating classical scores

    Parameters
    ----------
    spectrum_in:
        Input spectrum.
    min_peaks:
        Minimum number of peaks to pass the spectrum (otherwise -> None)
    """
    if spectrum_in is None:
        return None

    processed = spectrum_in.clone()
    processed = normalize_intensities(processed)
    # Spectra whose intensities were all zero become NaN after normalizing.
    if any(np.isnan(processed.peaks[1])):
        return None
    processed = select_by_mz(processed, mz_from=0, mz_to=1000)
    processed = require_minimum_number_of_peaks(processed, n_required=min_peaks)
    return select_by_relative_intensity(processed, intensity_from=0.01,
                                        intensity_to=1.0)
def test_require_minimum_number_of_peaks_required_4_or_1_no_parent_mass(spectrum_in: SpectrumType):
    """Without a parent mass the ratio_required scaling cannot apply, so only n_required counts."""
    spectrum_in.set("parent_mass", None)

    spectrum = require_minimum_number_of_peaks(spectrum_in, n_required=4, ratio_required=0.1)

    # BUG FIX: the two string fragments previously concatenated to
    # "...equal to therequired number" (missing space at the join).
    assert spectrum == spectrum_in, \
        "Expected the spectrum to qualify because the number of peaks (4) is equal to the " \
        "required number (4)."
def test_require_minimum_number_of_peaks_required_5_or_10(spectrum_in: SpectrumType):
    """Spectrum should be rejected: 4 peaks is below n_required=5 (and below the ratio-based 10)."""
    spectrum_in.set("parent_mass", 100)

    spectrum = require_minimum_number_of_peaks(spectrum_in, n_required=5, ratio_required=0.1)

    failure_message = "Did not expect the spectrum to qualify because the number of peaks (4) is less " \
                      "than the required number (10)."
    assert spectrum is None, failure_message
def get_mass_differences(spectrum_in: SpectrumType, multiply: bool = False,
                         max_mds_per_peak: int = 30, cutoff: int = 36,
                         n_max: int = 100) -> Union[Spikes, None]:
    """Returns Spikes with top X mass differences and intensities

    Parameters
    ----------
    spectrum_in:
        Spectrum in matchms.Spectrum format
    multiply:
        Multiply parent peak intensities instead of taking the mean
    max_mds_per_peak:
        Maximum amount of MDs that can originate from one peak, ranked on
        intensity. The minimum is 2 (with this implementation)
    cutoff:
        Mass cutoff for mass difference (default like Xing et al.)
    n_max:
        Maximum amount of mass differences to select, ranked on intensity
        (default like Xing et al.)
    """
    if spectrum_in is None:
        return None
    spectrum = spectrum_in.clone()
    peaks_mz_ori, peaks_intensities_ori = spectrum.peaks
    # sort on intensities to allow for max_mds_per_peak selection
    sort_idx = peaks_intensities_ori.argsort()[::-1]
    peaks_intensities = peaks_intensities_ori[sort_idx]
    peaks_mz = peaks_mz_ori[sort_idx]
    # for every peak, calculate MDs to all other peaks
    mass_diff_mz = []
    mass_diff_intensities = []
    # NOTE(review): duplicate m/z values would share a single counter in this
    # dict — confirm peak m/z values are unique at this point.
    used_mz_dict = {mz_val: 0 for mz_val in peaks_mz}  # keep track of used mz
    for i, (mz_i, int_i) in enumerate(
            zip(peaks_mz[:-1], peaks_intensities[:-1])):
        cur = used_mz_dict[mz_i]  # number of uses of this peak
        allowed = max_mds_per_peak - cur  # still allowed uses
        # Pair this peak with the next `allowed` peaks in intensity order.
        for mz_j, int_j in zip(peaks_mz[i + 1:i + 1 + allowed],
                               peaks_intensities[i + 1:i + 1 + allowed]):
            # update used peaks dict
            # NOTE(review): counters are incremented even when the pair is
            # discarded by the cutoff check below — confirm this is intended.
            used_mz_dict[mz_i] += 1
            used_mz_dict[mz_j] += 1
            # calculate mass difference
            mz_diff = mz_j - mz_i
            # only keep mass differences larger than the cutoff
            if mz_diff > cutoff:
                mass_diff_mz.append(mz_diff)
                if multiply:
                    new_intensity = int_i * int_j
                else:
                    new_intensity = np.mean([int_i, int_j])
                mass_diff_intensities.append(new_intensity)
    # sort on mz
    mass_diff_mz = np.array(mass_diff_mz)
    mass_diff_intensities = np.array(mass_diff_intensities)
    # keep the n_max most intense mass differences, then order them by m/z
    idx = mass_diff_intensities.argsort()[-n_max:]
    idx_sort_by_mz = mass_diff_mz[idx].argsort()
    mass_diff_peaks = Spikes(
        mz=mass_diff_mz[idx][idx_sort_by_mz],
        intensities=mass_diff_intensities[idx][idx_sort_by_mz])
    return mass_diff_peaks